In my initial analysis of the freshwater-saltwater dataset, I had three vcf files: 1. one generated from all pairwise comparisons of populations, containing SNPs only found in all 16 populations, in 75% of individuals, and with a minor allele frequency of at least 5%. (“separate”) 2. one generated from all pairwise comparisons of populations, containing SNPs found in 4 populations, in 75% of individuals, and with a minor allele frequency of at least 5%. (“P4”) 3. one generated from comparing lumped ‘freshwater’ and ‘saltwater’ populations, containing SNPs found in 50% of individuals and with a minor allele frequency of at least 5%. (“lumped”)
I did the majority of the analyses using set #3, but would like to explore what changes if I use dataset #2. I think #1 is too restrictive. Dataset #2 is in the “subset” dataset, and is what I ran the structure analyses on. This is the dataset I’m going to move forward with for the paper.
Note that in most of these cases the actual analysis will be set to eval=FALSE once I’ve run it once, because then I save the output and only have to read it in, saving compilation time.
source("../../gwscaR/R/gwscaR.R")
source("../../gwscaR/R/gwscaR_plot.R")
source("../../gwscaR/R/gwscaR_utility.R")
source("../../gwscaR/R/gwscaR_fsts.R")
source("../../gwscaR/R/gwscaR_popgen.R")
source("../scripts/002_treemix_plotting_funcs.R")#I've modified these functions
library(knitr)
pop.list<-c("TXSP","TXCC","TXFW","TXCB","LAFW","ALST","ALFW","FLSG","FLKB",
"FLFD","FLSI","FLAB","FLPB","FLHB","FLCC","FLLG")
pop.labs<-c("TXSP","TXCC","TXFW","TXCB","LAFW","ALST","ALFW","FLSG","FLKB",
"FLFD","FLSI","FLAB","FLPB","FLHB","FLCC","FLFW")
fw.list<-c("TXFW","LAFW","ALFW","FLLG")
sw.list<-c("TXSP","TXCC","TXCB","ALST","FLSG","FLKB",
"FLFD","FLSI","FLAB","FLPB","FLHB","FLCC")
lgs<-c("LG1","LG2","LG3","LG4","LG5","LG6","LG7","LG8","LG9","LG10","LG11",
"LG12","LG13","LG14","LG15","LG16","LG17","LG18","LG19","LG20","LG21",
"LG22")
lgn<-seq(1,22)
all.colors<-c(rep("black",2),"#2166ac","black","#2166ac","black","#2166ac",
rep("black",8),"#2166ac")
#grp.colors<-c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ffff33','#f781bf')
grp.colors<-c('#762a83','#af8dc3','#e7d4e8','#d9f0d3','#7fbf7b','#1b7837')
pwise.fst.all<-read.table("stacks/fwsw_fst_summary.txt",header=T,row.names=1,sep='\t')
#pwise.fst.all<-rbind(pwise.fst.all,rep(NA,ncol(pwise.fst.all)))
rownames(pwise.fst.all)<-pop.labs
colnames(pwise.fst.all)<-pop.labs
pwise.fst.sub<-read.table("stacks/fwsw_fst_summary_subset.txt",header=T,row.names=1,sep='\t')
colnames(pwise.fst.sub)<-pop.labs
rownames(pwise.fst.sub)<-pop.labs
print(paste("Average Pairwise Fst is ",mean(pwise.fst.sub[upper.tri(pwise.fst.sub)]),sep=""))
ped.sub<-read.table("stacks/subset.ped",header=F)
ped.sub$V1<-gsub("sample_(\\w{4})\\w+.*","\\1",ped.sub$V2)
map.sub<-read.table("stacks/subset.map",header = F,stringsAsFactors = F)
map.sub$Locus<-paste(gsub("(\\d+)_\\d+","\\1",map.sub$V2),(as.numeric(map.sub$V4)+1),sep=".")
colnames(ped.sub)<-c("Pop","IID","","","","Phenotype","","",map.sub$Locus)
This P4/subset dataset has 9820 SNPs from 9820 RAD loci, from 698 indivudals in 16 populations.
The first thing to do is to create a vcf file using the subset parameters. I’ve already got a whitelist of loci in the subsetted dataset, so I need to run populations -b 2 -W fwsw_results/subset.whitelist.txt -P fwsw_results/stacks -M fwsw_pops_map.txt --vcf, which I did on 2017-12-18 on silivren-lond. I then re-named it to p4.vcf (and the other output files).
vcf<-parse.vcf("p4.upd.vcf") #this is the smaller dataset
vcf$SNP<-paste(vcf$`#CHROM`,vcf$POS,sep=".")
scaffs<-levels(as.factor(vcf[,1]))
scaffs[1:22]<-lgs
scaff.starts<-tapply(vcf$POS,vcf$`#CHROM`,max)
scaff.starts<-data.frame(rbind(cbind(names(scaff.starts),scaff.starts)),stringsAsFactors = F)
locus.info<-c(colnames(vcf[1:9]),"SNP")
The vcf file contains 9638 SNPs from 9638 RAD loci.
Choose a subset of the SNPs to re-use. [Do I need to do this?]
chosen.snps<-choose.one.snp(vcf)$SNP
write.table(chosen.snps,"chosen.all.snps.txt",quote=F)
chosen.snps<-unlist(read.table("chosen.all.snps.txt"))
There are 2546 SNPs that I’ll use from the vcf file, from 9638 RAD loci.
The first figure in the paper is a map of the collection sites.
jpeg("all_sites_map.jpg", res=300, height=7,width=14, units="in")
pdf("all_sites_map.pdf",height=7,width=14)
par(oma=c(0,0,0,0),mar=c(0,0,0,0),pin=c(7,7))
map("worldHires", "usa",xlim=c(-100,-76), ylim=c(24,32),
col="gray90", mar=c(0,0,0,0),fill=TRUE, res=300,myborder=0)
map("worldHires", "mexico",xlim=c(-100,-76), ylim=c(24,32),
col="gray95", fill=TRUE, add=TRUE)
points(mar.coor$lon, mar.coor$lat, col="black", cex=2, pch=19)
points(-1*fw.coor$lon, fw.coor$lat, col="cornflowerblue", cex=2, pch=18)
abline(h=c(25,30,35),lty=3)
abline(v=c(-80,-85,-90,-95,-100),lty=3)
text(x=c(-99.5,-99.5),y=c(25,30),c("25N","30N"),cex=1.75)
text(x=c(-80,-85,-90,-95),y=rep(31.8,4),c("80W","85W","90W","95W"),cex=1.75)
text(y=26,x=-90,"Gulf of Mexico",cex=1.75)
text(y=25.5,x=-98.5,"Mexico",cex=1.75)
text(x=-91,y=31,"USA",cex=1.75)
text(x=-78,y=29.5,"Atlantic Ocean",cex=1.75)
text(x=-96.5,y=26,"TXSP",font=2,cex=1.75)
text(x=-96.7,y=27.2,"TXCC",font=2,cex=1.75)
text(x=-96,y=28.3,"TXFW",font=2,col="cornflowerblue",cex=1.75)
text(x=-94.7,y=29,"TXCB",font=2,cex=1.75)
text(x=-90.2,y=30.3,"LAFW",font=2,col="cornflowerblue",cex=1.75)
text(x=-88,y=30,"ALST",font=2,cex=1.75)
text(x=-87,y=30.75,"ALFW",font=2,col="cornflowerblue",cex=1.75)
text(x=-85,y=29.4,"FLSG",font=2,cex=1.75)
text(x=-83.5,y=29.2,"FLKB",font=2,cex=1.75)
text(x=-83.2,y=27.6,"FLFD",font=2,cex=1.75)
text(x=-82.2,y=26,"FLSI",font=2,cex=1.75)
text(x=-80,y=24.8,"FLAB",font=2,cex=1.75)
text(x=-79.3,y=26.8,"FLPB",font=2,cex=1.75)
text(x=-79.5,y=27.2,"FLHB",font=2,cex=1.75)
text(x=-80.2,y=28,"FLCC",font=2,cex=1.75)
text(x=-80.9,y=29.5,"FLFW",font=2,col="cornflowerblue",cex=1.75)
dev.off()
Figure 1. Map of collection sites
The second figure in the paper is showing population structure, using STRUCTURE, adegenet, and PCAdapt. These analyses were run as exactly written in 002_fwsw_analysis.R, so I won’t reproduce that code here.
Now for a combined figure:
dapc1<-readRDS("dapc1.RDS")
ind.names<-rownames(dapc1$ind.coord)
pop<-substr(ind.names, 8,11)
colors<-pop
colors[colors %in% sw.list]<-"black"
colors[colors %in% fw.list]<-"#2166ac"
pop.pch<-pop
pop.pch[pop.pch=="TXSP"]<-"0"
pop.pch[pop.pch=="TXCC"]<-"1"
pop.pch[pop.pch=="TXFW"]<-"21"
pop.pch[pop.pch=="TXCB"]<-"2"
pop.pch[pop.pch=="LAFW"]<-"24"
pop.pch[pop.pch=="ALST"]<-"3"
pop.pch[pop.pch=="ALFW"]<-"23"
pop.pch[pop.pch=="FLSG"]<-"4"
pop.pch[pop.pch=="FLKB"]<-"5"
pop.pch[pop.pch=="FLFD"]<-"6"
pop.pch[pop.pch=="FLSI"]<-"7"
pop.pch[pop.pch=="FLAB"]<-"9"
pop.pch[pop.pch=="FLPB"]<-"10"
pop.pch[pop.pch=="FLHB"]<-"11"
pop.pch[pop.pch=="FLCC"]<-"12"
pop.pch[pop.pch=="FLLG"]<-"22"
#pop plot info
ppi<-data.frame(Pop=pop.labs,cols = all.colors,pch=c(0,1,21,2,24,3,23,4,5,6,7,9,10,11,12,22))
da<-data.frame(Individual=rownames(dapc1$ind.coord),Pop=substr(rownames(dapc1$ind.coord),8,11),
LD1=dapc1$ind.coord[,1],LD2=dapc1$ind.coord[,2],
LD3=dapc1$ind.coord[,3],LD4=dapc1$ind.coord[,4],LD5=dapc1$ind.coord[,5],
Group=dapc1$grp, GrpName=names(dapc1$grp),
stringsAsFactors = F) #include grpname to check
da$Pop[da$Pop == "FLLG"]<-"FLFW"
da$colors<-da$Pop
for(i in 1:nrow(da)){
da[i,"colors"]<-as.character(ppi[ppi$Pop %in% da[i,"Pop"],"cols"])
}
da$pch<-da$Pop
for(i in 1:nrow(da)){
da[i,"pch"]<-as.numeric(ppi[ppi$Pop %in% da[i,"Pop"],"pch"])
}
npop<-length(pop.list)
pseq<-1:npop
m<-matrix(c(1:32,rep(33,7),rep(34,7),rep(0,2),
rep(35,7),rep(36,7),rep(0,2)),
nrow=4,ncol=npop,byrow = T)
jpeg("combinedStructure.jpeg",res=300,height=8,width=8,units="in")
layout(mat=m,heights=c(1,1,6,6))
#STRUCTURE
par(oma=c(1.5,3.5,1,1),mar=c(1,0,0,0))
plotting.structure(structure.k2,2,pop.list, make.file=FALSE, xlabcol = all.colors,plot.new=F,
colors=grp.colors[c(1,6)],xlabel=F,
ylabel=expression(atop(italic(K)==2,358.9)))
plotting.structure(structure.k6,2,pop.labs, make.file=FALSE,
plot.new=F,colors=grp.colors,xlabel=T,
xlabcol = all.colors,
ylabel=expression(atop(italic(K)==6,326.1)))
#PCADAPT
par(mar=c(2,2,2,2))
plot(pa$scores[,1],pa$scores[,2],col=alpha(pap$cols,0.5),bg=alpha(pap$cols,0.75),
pch=as.numeric(pap$pch), cex=2,bty="L",xlab="",ylab="",cex.axis=1.5)
#points(pa$scores[grp=="freshwater",1],pa$scores[grp=="freshwater",2],
# col=alpha(pap$cols[pap$grp=="freshwater"],0.5),
# bg=alpha(pap$cols[pap$grp=="freshwater"],0.75),pch=as.numeric(pap$pch[pap$grp=="freshwater"]),
# cex=2) #to highlight the fw pops
mtext(paste("PC1 (",pa.props[1],"%)",sep=""),1,line = 2.5,cex=1)
mtext(paste("PC2 (",pa.props[2],"%)",sep=""),2,line = 2.5,cex=1)
plot(pa$scores[,3],pa$scores[,4],col=alpha(pap$cols,0.5),bg=alpha(pap$cols,0.75),pch=as.numeric(pap$pch),
cex=2, bty="L",xlab="",ylab="",cex.axis=1.5)
#points(pa$scores[grp=="freshwater",3],pa$scores[grp=="freshwater",4],
# col=alpha(pap$cols[pap$grp=="freshwater"],0.5),
# bg=alpha(pap$cols[pap$grp=="freshwater"],0.75),pch=as.numeric(pap$pch[pap$grp=="freshwater"]),
# cex=2)
mtext(paste("PC3 (",pa.props[3],"%)",sep=""),1,line = 2.5,cex=1)
mtext(paste("PC4 (",pa.props[4],"%)",sep=""),2,line = 2.5,cex=1)
# plot(pa$scores[,5],pa$scores[,6],col=alpha(pap$cols,0.5),bg=alpha(pap$cols,0.75),pch=as.numeric(pap$pch),
# cex=1.5,bty="L",xlab="",ylab="")
# points(pa$scores[grp=="freshwater",5],pa$scores[grp=="freshwater",6],
# col=alpha(pap$cols[pap$grp=="freshwater"],0.5),
# bg=alpha(pap$cols[pap$grp=="freshwater"],0.75),pch=as.numeric(pap$pch[pap$grp=="freshwater"]),
# cex=1.5)
# mtext(paste("PC5 (",pa.props[5],"%)",sep=""),1,line = 2,cex=0.75)
# mtext(paste("PC6 (",pa.props[6],"%)",sep=""),2,line = 2,cex=0.75)
# ADEGENET
par(mar=c(2,2,2,2))
plot(da$LD1,da$LD2,col=alpha(da$colors,0.5),pch=as.numeric(da$pch),cex=2,lwd=1.3,
bg=alpha(da$colors,0.75),xlab="",ylab="",xlim=c(-10,25),ylim=c(-10,15),bty="n",cex.axis=1.5)
par(lwd=3,bty='n')
s.class(dapc1$ind.coord,fac=factor(da$Group), clabel=0,cstar=0,cellipse=2.5,
addaxes = F,pch="",grid=F,axesel=F,add.plot = T,col=grp.colors[c(1,5,6,3,2,4)],xlim=c(-10,25),ylim=c(-10,15))
mtext(paste("DAPC 1 (", round(dapc1$eig[1]/sum(dapc1$eig)*100, 2), "%)", sep=""),
1, line = 2.5,cex=1)
mtext(paste("DAPC 2 (", round(dapc1$eig[2]/sum(dapc1$eig)*100, 2), "%)", sep=""),
2, line = 2.1,cex=1)
plot(da$LD3,da$LD4,col=alpha(da$colors,0.5),pch=as.numeric(da$pch),cex=2,lwd=1.3,
bg=alpha(da$colors,0.75),xlab="",ylab="",xlim=c(-10,10),ylim=c(-10,5),cex.axis=1.5,bty="n")
par(lwd=3,bty='n')
s.class(dapc1$ind.coord[,3:4],fac=factor(da$Group), clabel=0,cstar=0,cellipse=2.5,
addaxes = F,pch="",grid=F,axesel=F,add.plot = T,col=grp.colors[c(1,5,6,3,2,4)],xlim=c(-10,10),ylim=c(-10,5))
mtext(paste("DAPC 3 (", round(dapc1$eig[3]/sum(dapc1$eig)*100, 2), "%)", sep=""),
1, line = 2.5,cex=1)
mtext(paste("DAPC 4 (", round(dapc1$eig[4]/sum(dapc1$eig)*100, 2), "%)", sep=""),
2, line = 2.1,cex=1)
# plot(da$LD4,da$LD5,col=alpha(da$colors,0.5),pch=as.numeric(da$pch),cex=2,lwd=1.3,
# bg=alpha(colors,0.25),xlab="",ylab="",xlim=c(-5,15),ylim=c(-10,5))
# par(lwd=3,bty='n')
# s.class(dapc1$ind.coord[,4:5],fac=factor(da$Group), clabel=0,cstar=0,cellipse=2.5,
# addaxes = F,pch="",grid=F,axesel=F,add.plot = T,col=grp.colors[c(3,5,6,4,2,1)],xlim=c(-5,15),ylim=c(-10,5))
# mtext(paste("DAPC 4 (", round(dapc1$eig[4]/sum(dapc1$eig)*100, 2), "%)", sep=""),
# 1, line = 2,cex=0.75)
# mtext(paste("DAPC 5 (", round(dapc1$eig[5]/sum(dapc1$eig)*100, 2), "%)", sep=""),
# 2, line = 2,cex=0.75)
par(fig = c(0, 1, 0, 1), oma=c(2,1,0,1), mar = c(0, 0, 0, 0), new = TRUE,
cex=1,lwd=1.3)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n")
legend(x=0.75,y=0.5, legend=ppi$Pop, pch=as.numeric(ppi$pch), pt.cex=1.5,cex=1,
col=alpha(ppi$cols, 0.5),pt.bg=alpha(ppi$cols,0.75), ncol=1,bty='n')
#add delta K
text(x=-1.02,y=0.7,srt=90,labels = expression(Delta~italic(K)==""))
dev.off()
Figure 2. Population Structure
I don’t need to re-calculate pairwise Jost’s D, and Fsts using the P4 (or “subset”) dataset, so I can just read in those files. But I do need to run Treemix and PopTree2.
To run treemix, I follow the following steps:
-m.All of these require setting a root, which is FLPB based on previous trees.
First, I need to create a file in the correct format, which uses the vcf file:
treemix.name<-"treemix/p4_treemix"
treemix.prefix<-"treemix/p4_"
poporder.file<-"treemix/poporder"
fst.tree.name<-as.character("ALLfst_cov_heatmap.png")
tm.fwsw<-treemix.from.vcf(vcf,pop.list)
write.table(tm.fwsw,treemix.name,col.names=TRUE,row.names=FALSE,quote=F,sep=' ')
#then in unix: gzip -c treemix.name > treemix.name.gz
Then, in unix, I need to run gzip -c treemix/p4_treemix > treemix/p4_treemix.gz. Now I can run scripts/run_treemix.sh, which implements steps 1 and 2, and which I need to run in Ubuntu. Note that there are a lot of “no counts” warnings from treemix. Also, that it runs very quickly
After that, I can evaluate the different outcomes.
setwd("treemix")
source("../../scripts/002_treemix_plotting_funcs.R")#I've modified these functions
poporder<-c("TXSP","TXCC","TXFW","TXCB","LAFW","ALST",
"ALFW","FLSG","FLKB","FLFD","FLSI","FLAB",
"FLPB","FLHB","FLCC","FLLG")
colors<-poporder
colors[colors %in% "FLLG"]<-grp.colors[6]
colors[colors %in% c("FLPB","FLHB","FLCC")]<-grp.colors[6]
colors[colors %in% c("FLAB")]<-grp.colors[5]
colors[colors %in% c("FLSI","FLFD","FLKB","FLSG")]<-grp.colors[3]
colors[colors %in% c("ALST","ALFW","LAFW")]<-grp.colors[2]
colors[colors %in% c("TXSP","TXCC","TXFW","TXCB")]<-grp.colors[1]
write.table(cbind(poporder,colors),"poporder",quote=F,sep='\t')
setwd("../")
m0<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBr",sep=""),poporder,split=c(1,1,3,2),more=TRUE)
m1<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBrm1",sep=""),poporder,split=c(2,1,3,2),more=TRUE)
m2<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBrm2",sep=""),poporder,split=c(3,1,3,2),more=TRUE)
m3<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBrm3",sep=""),poporder,split=c(1,2,3,2),more=TRUE)
m4<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBrm4",sep=""),poporder,split=c(2,2,3,2),more=TRUE)
m5<-treemix.cov.plot(paste(treemix.prefix,"k100bFLPBrm5",sep=""),poporder,split=c(3,2,3,2),more=FALSE)
par(mfrow=c(2,3))
r0<-plot_resid(paste(treemix.prefix,"k100bFLPBr",sep=""),poporder.file)
r1<-plot_resid(paste(treemix.prefix,"k100bFLPBrm1",sep=""),poporder.file)
r2<-plot_resid(paste(treemix.prefix,"k100bFLPBrm2",sep=""),poporder.file)
r3<-plot_resid(paste(treemix.prefix,"k100bFLPBrm3",sep=""),poporder.file)
r4<-plot_resid(paste(treemix.prefix,"k100bFLPBrm4",sep=""),poporder.file)
r5<-plot_resid(paste(treemix.prefix,"k100bFLPBrm5",sep=""),poporder.file)
Population pairs that are ‘too far apart’ on the tree (have high error estimates) are ones that are likely candidates for gene flow - and these are the squares with dark greens, blues, and black. These are LAFW-TXFW and ALFW-TXFW in almost all of the SE graphs
par(mfrow=c(2,3),mar=c(1,1,1,1),oma=c(1,1,1,1))
t0<-plot_tree(paste(treemix.prefix,"k100bFLPBr",sep=""),plotmig = F,plus=0.05,scale=F,mbar=F)
t1<-plot_tree(paste(treemix.prefix,"k100bFLPBrm1",sep=""),plus=0.05,scale=F,mbar=F)
t2<-plot_tree(paste(treemix.prefix,"k100bFLPBrm2",sep=""),plus=0.05,scale=F,mbar=F)
t3<-plot_tree(paste(treemix.prefix,"k100bFLPBrm3",sep=""),plus=0.05,scale=F,mbar=F)
t4<-plot_tree(paste(treemix.prefix,"k100bFLPBrm4",sep=""),plus=0.05,scale=F,mbar=F)
t5<-plot_tree(paste(treemix.prefix,"k100bFLPBrm5",sep=""),plus=0.05,scale=F,mbar=F)
Look at the p-values: do migration events always improve the fit of the data?
tree0<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBr.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "")
tree1<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBrm1.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "",skip=1)
tree2<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBrm2.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "",skip=1)
tree3<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBrm3.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "",skip=1)
tree4<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBrm4.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "",skip=1)
tree5<-read.table(gzfile(paste(treemix.prefix,"k100bFLPBrm5.treeout.gz",sep="")), as.is = T, comment.char = "", quote = "",skip=1)
tree1[,4]
## [1] 9.18188e-12
tree2[,4]
## [1] "6.61839e-05" "<2.22507e-308"
tree3[,4]
## [1] "1.02604e-05" "<2.22507e-308" "1.88599e-10"
tree4[,4]
## [1] "<2.22507e-308" "<2.22507e-308" "<2.22507e-308" "2.89325e-05"
tree5[,4]
## [1] "6.55032e-15" "4.94959e-09" "<2.22507e-308" "0.00690872"
## [5] "<2.22507e-308"
All of the added migration edges improve the fit. Evaluating both the residual and migration edge plots, we can see that four migration edges reduces the error between LAFW & TXFW and ALFW & TXFW. The largest SE is between FLFW (FLLG) and itself, and secondarily between LAFW and itself. This means that those branches are surprisingly long, which I am comfortable accepting. The strongest migration edge, between the branch from the Atlantic Florida to Gulf Florida populations -> FLAB, is unsurprising because that is an intermediate location between the Atlantic and Gulf coasts. In some of the structure analyses it clusters with the Gulf coast and in others it is its own admixed group - supporting the tree structure here.
The other migration events are somewhat more surprising. We will evaluate those using the three and four population analyses.
First, let’s read in the three and four population analysis results. These tested all possible three- and four-population tree groups to see whether they pass a test of ‘treeness’. If not, their p-value will be small, and they don’t form a nice tree.
f3.name<-"treemix/p4_threepop.txt"
f4.name<-"treemix/p4_fourpop.txt"
#extract the relevant lines
extract.fs<-function(filename,pat="^[A-Z]{4};"){
f<-readLines(filename)
fmatch<-f[grep(pat,f)]
f<-read.table(text=fmatch)
colnames(f)<-c("pops","f","SE","Z")
return(f)
}
f3<-extract.fs(f3.name)
f4<-extract.fs(f4.name,pat="^[A-Z]{4},")
#add p-values
f3$p<-pnorm(f3$Z)
f4$p<-pnorm(f4$Z)
Now we can investigate the migration edges added to the trees.
This least surprising migration edge would be expected to show a signature of admixture. I’ll check the treeness of FLHB;FLSI,FLAB and FLHB,FLCC;FLSI,FLAB. For comparison, I can look at FLHB;FLSI,FLFD and FLHB,FLCC;FLSI,FLFD
checks3<-c("FLHB;FLSI,FLAB","FLHB;FLFD,FLSI")
f3[f3$pops %in% checks3,]
## pops f SE Z p
## 1583 FLHB;FLFD,FLSI 0.0250745 0.000545858 45.9359 1
## 1625 FLHB;FLSI,FLAB 0.0224990 0.000508579 44.2390 1
checks4<-c("FLSI,FLAB;FLHB,FLCC","FLFD,FLSI;FLHB,FLCC")
f4[f4$pops %in% checks4,]
## pops f SE Z p
## 5379 FLFD,FLSI;FLHB,FLCC -0.000159989 6.74083e-05 -2.37343 8.811867e-03
## 5425 FLSI,FLAB;FLHB,FLCC -0.000470815 8.68360e-05 -5.42188 2.948773e-08
They pass the three-population tests but not the four-population tests.
To investigate this migration edge, I need to evaluate whether a [TXFW[TXCB,TXCC]] or [TXFW[TXCB,TXSP]] topology is an appropriate tree.
checks<-c("TXFW;TXCC,TXCB",
"TXFW;TXSP,TXCB")
f3[f3$pops %in% checks,]
## pops f SE Z p
## 44 TXFW;TXSP,TXCB 0.0294602 0.00227983 12.9221 1
## 318 TXFW;TXCC,TXCB 0.0304362 0.00230450 13.2073 1
Neither of these fail the treeness test, so TXFW is essentially functioning as an effective outgroup/more ancestral population. Let’s check if a four population tree also makes sense.
f4[f4$pops %in% "TXSP,TXCC;TXFW,TXCB",]
## pops f SE Z p
## 2 TXSP,TXCC;TXFW,TXCB 0.000976003 0.000160092 6.09653 1
This also passes the treeness test - so this isn’t an actual admixture event, more of demonstrating shared ancestry.
This migration edge would suggest a three-population structure of [TXFW[ALFW,LAFW]] or a four-population structure of [TXFW,TXCB[LAFW,ALFW]] or [TXFW,ALST[LAFW,ALFW]]
checks<-c("TXFW,TXCB;LAFW,ALFW",
"TXFW,ALST;LAFW,ALFW")#to get the correct order of populations I had to
#use grep and manually try a few different ones
f4[f4$pops %in% checks,]
## pops f SE Z p
## 2463 TXFW,TXCB;LAFW,ALFW -0.00140534 0.000542958 -2.58830 4.822547e-03
## 2657 TXFW,ALST;LAFW,ALFW -0.00416643 0.000526403 -7.91491 1.237160e-15
f3[f3$pops == "TXFW;LAFW,ALFW",]
## pops f SE Z p
## 630 TXFW;LAFW,ALFW 0.0349272 0.00105283 33.1745 1
The three-population test does not fail the treeness test but the four-population tests do fail the treeness tests. This suggests that TXFW functions as an outgroup to the LAFW and ALFW - they are perhaps derived from the TXFW population rather than experiencing current admixture.
FLLG is actually FLFW, and there’s a migration edge from it to the Alabama/Louisiana clade and the Texas clade. So, we can test FLLG;ALST,TXFW, FLLG;ALFW,TXCB, FLLG;ALFW,TXFW or FLLG;ALST,TXCC - see if the migration is specifically to freshwater populations or not.
checks<-c("FLLG;TXFW,ALST","FLLG;TXCB,ALFW","FLLG;TXCB,LAFW",#mix of freshwater
"FLLG;TXFW,ALFW","FLLG;TXFW,LAFW", #all freshwater
"FLLG;TXCC,ALST") #no freshwater
f3[f3$pops %in% checks,]
## pops f SE Z p
## 452 FLLG;TXCC,ALST 0.0843051 0.00214158 39.3658 1
## 655 FLLG;TXFW,LAFW 0.0897111 0.00256943 34.9147 1
## 686 FLLG;TXFW,ALST 0.0833365 0.00215944 38.5917 1
## 713 FLLG;TXFW,ALFW 0.0928283 0.00279453 33.2179 1
## 853 FLLG;TXCB,LAFW 0.0857054 0.00220127 38.9345 1
## 911 FLLG;TXCB,ALFW 0.0874172 0.00219662 39.7963 1
These all pass the treeness test.
treemix.file<-as.character("treemix/p4_k100bFLPBr.cov.gz")
tm.vertices<-"treemix/p4_k100bFLPBrm4.vertices.gz"
tm.plot<-"treemix/p4_treemix_m4_FLPB.png"
tm.tree<-"treemix/p4_k100bFLPBrm4"
For the PopTree2 analysis, I need to convert the vcf file to genepop format. I did this using PGDSpider2. Then I ran PopTree2 on Windows10, using Da to calculate neighbor-joining trees and using 1000 bootstrap replicates.
Or, if that doesn’t work,
remove.missing.data<-function(vcf, pop.list){
exclude<-NULL
for(i in 1:nrow(vcf))
{
vcf.row<-vcf[i,colnames(vcf) != "SNP"]#remove this if it exists
missingness<-unlist(lapply(pop.list,function(pop){
pop.vcf<-vcf.row[,grep(pop,colnames(vcf.row))]
missing<-length(grep("\\.\\/\\.",pop.vcf))
prop.missing<-missing/length(pop.vcf)
return(prop.missing)
}))
if(length(missingness[missingness==1])>0){
print(paste("Row ", i, " is has no data for pop ", pop.list[which(missingness==1)]))
exclude<-c(exclude,i)
}
}
if(!is.null(exclude)){
return(vcf[-exclude,])
}else{
return(vcf)
}
}
gpop.name<-"poptree/p4.genepop"
sub.prefix<-"poptree/p4_"
vcf<-remove.missing.data(vcf, pop.list)
for(i in 1:10){
rowsub<-sample(nrow(vcf),1000,replace = FALSE)
gpopsub<-vcf2gpop(vcf[rowsub,colnames(vcf)!="SNP"],pop.list,paste(sub.prefix,i,".genepop",sep=""))
}
gpop<-vcf2gpop(vcf[,colnames(vcf)!="SNP"],pop.list,gpop.name)
#then run poptree on all of them
And then run poptree. Poptree ran on the full dataset as well as the subsets of 1000 SNPs. Did they all provide similar results? What does the consensus tree look like?
poptree.dir<-"poptree/"
poptree.prefix<-"p4"
poptree.png<-"p4.poptree.png"
poptree.files<-list.files(path = poptree.dir,pattern=paste(poptree.prefix,".*.nwk",sep=""))
poptree.files<-lapply(poptree.files,function(x){ paste("poptree",x,sep="/")})
poptrees<-lapply(poptree.files,read.tree)
con.poptree<-consensus(poptrees)
con.poptree$tip.label[con.poptree$tip.label=="FLLG"]<-"FLFW"
clcolr <- rep("black", dim(con.poptree$edge)[1])
#clcolr[c(12,13,14,24)]<-all.colors[3]
#png(paste(poptree.dir,poptree.prefix,".consensus.png",sep=""),height=7,width=7,units="in",res=300)
#dev.off()
png(paste(poptree.dir,poptree.prefix,".png",sep=""),height=10,width=10,units="in",res=300)
par(mfrow=c(3,4),oma=c(1,1,1,1),mar=c(1,1,1,1))
for(i in 1:length(poptrees)){
plot.phylo(poptrees[[i]],cex=1.5)
mtext(poptree.files[i],3)
}
plot.phylo(con.poptree,tip.color = c(rep(grp.colors[6],4),grp.colors[5],
rep(grp.colors[1],4),rep(grp.colors[2],3),
rep(grp.colors[3],4)),
edge.color = clcolr,edge.width = 2,cex=1,font=1)
mtext("Consensus")
dev.off()
## png
## 2
All Trees
Based on this, I’m going to move forward just with the full dataset (which includes bootstrap values).
#just the full subset tree
pt.subtree<-poptrees[[1]]
pt.subtree$tip.label[pt.subtree$tip.label=="FLLG"]<-"FLFW"
pt.colors<-pt.subtree$tip.label
pt.colors[pt.colors %in% "FLFW"]<-grp.colors[6]
pt.colors[pt.colors %in% c("FLPB","FLHB","FLCC")]<-grp.colors[6]
pt.colors[pt.colors %in% c("FLAB")]<-grp.colors[5]
pt.colors[pt.colors %in% c("FLSI","FLFD","FLKB","FLSG")]<-grp.colors[3]
pt.colors[pt.colors %in% c("ALST","ALFW","LAFW")]<-grp.colors[2]
pt.colors[pt.colors %in% c("TXSP","TXCC","TXFW","TXCB")]<-grp.colors[1]
clcolr <- rep("black", dim(pt.subtree$edge)[1])
clcolr[c(6,19,20,21,22)]<-all.colors[3]
png(poptree.png,height=7,width=7,units="in",res=300)
plot.phylo(pt.subtree,tip.color = pt.colors,
edge.color = clcolr,edge.width = 2,label.offset = 0.0015)
dev.off()
## png
## 2
PopTree
To get the Poptree distance matrix, I copied and pasted the distance matrix in the p4.out file into a new file and saved it as p4.distance.out. I had to first save each part of the matrix as text files, open them in Excel to standardize the spacings, merge them, and then save them.
pt.dist<-as.matrix(read.table("poptree/p4.distance.out",header=T,row.names=1,sep='\t'))
jostpw<-as.matrix(read.table("Subset.JostsD.tsv",header=T,sep='\t'))
pwise.fst.raw<-read.table("stacks/fwsw_fst_summary_subset.txt",header=T,row.names=1,sep='\t')#p4_fst_summary.txt
pwise.fst<-matrix(nrow=length(pop.list),ncol=length(pop.list))
dimnames(pwise.fst)[[1]]<-dimnames(pwise.fst)[[2]]<-pop.list
for(pop1 in 1:length(pop.list)){
for(pop2 in 1:length(pop.list)){
if(pop1 != pop2){
raws<-c(pwise.fst.raw[pop.list[pop1],pop.list[pop2]],pwise.fst.raw[pop.list[pop2],pop.list[pop1]])
pwise.fst[pop.list[pop1],pop.list[pop2]]<-raws[!is.na(raws)]
}
}
}
dimnames(pwise.fst)[[1]]<-dimnames(pwise.fst)[[2]]<-pop.labs
pwise.fst[lower.tri(pwise.fst)]<-NA
heatmaps.name<-"p4.heatmap.png"
colors<-c("blue","yellow","red")
pal<-colorRampPalette(colors)
ncol=80
cols<-pal(ncol)
cov<-read.table(gzfile(treemix.file), as.is = T, head = T, quote = "", comment.char = "")
#reorder
covplot <- data.frame(matrix(nrow = nrow(cov), ncol = ncol(cov)))
for(i in 1:length(pop.list)){
for( j in 1:length(pop.list)){
covplot[i, j] = cov[which(names(cov)==pop.list[i]), which(names(cov)==pop.list[j])]
rownames(covplot)[i]<-pop.list[i]
colnames(covplot)[j]<-pop.list[j]
}
}
cp<-as.matrix(covplot)
cp[lower.tri(cp)]<-NA
cp[upper.tri(cp)]<-covplot[upper.tri(covplot)]
colnames(cp)<-pop.labs
rownames(cp)<-pop.labs
cp.lv<-levelplot(cp,col.regions=cols,alpha.regions=0.7,
scales = list(x=list(rot=90),tck = 0),xlab="",ylab="")
dimnames(pt.dist)[[1]]<-dimnames(pt.dist)[[2]]<-pop.labs
dimnames(jostpw)[[1]]<-dimnames(jostpw)[[2]]<-pop.labs
colors<-c("black","darkgrey","grey","lightgrey","cornflowerblue")
pal<-colorRampPalette(colors)
ncol=80
cols<-pal(ncol)
rev.colors<-c("cornflowerblue","lightgrey","grey","darkgrey","black")
rev.pal<-colorRampPalette(rev.colors)
rev.cols<-rev.pal(ncol)
hm.height<-list(x=3.8,units="in")#2.2
hm.width<-list(x=3.9,units="in")#2.4 in RStudio
png(heatmaps.name,height=11,width=11,units="in",res=300)
fst.lv<-levelplot(as.matrix(pwise.fst),col.regions=cols,alpha.regions=0.7,
scales = list(x=list(rot=90),tck = 0),xlab="",ylab="")
print(fst.lv,split=c(1,1,2,2),more=TRUE,panel.width=hm.width,
panel.height=hm.height,cex=2)
trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE)
grid.text(expression(italic(F)[ST]), 0.2, 0, hjust=0.5, vjust=1.2,gp=gpar(cex=0.75))
trellis.unfocus()
cp.lv<-levelplot(cp,col.regions=rev.cols,alpha.regions=0.7,
scales = list(x=list(rot=90),tck = 0),xlab="",ylab="")
print(cp.lv,split=c(1,2,2,2),more=FALSE,newpage=FALSE,panel.width=hm.width,
panel.height=hm.height,cex=2)
trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE)
grid.text("covariance", 0.2, 0, hjust=0.5, vjust=1.2,gp=gpar(cex=0.75))
trellis.unfocus()
jost.lv<-levelplot(jostpw,col.regions=cols,alpha.regions=0.7,
scales = list(x=list(rot=90),tck = 0),xlab="",ylab="")
print(jost.lv,split=c(2,1,2,2),more=FALSE,newpage=FALSE,panel.width=hm.width,
panel.height=hm.height,cex=2)
trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE)
grid.text(expression("Jost's"~italic(D)), 0.2, 0, hjust=0.5, vjust=1.2,gp=gpar(cex=0.75))
trellis.unfocus()
ptdist.lv<-levelplot(pt.dist,col.regions=cols,alpha.regions=0.7,
scales = list(x=list(rot=90),tck = 0),xlab="",ylab="")
print(ptdist.lv,split=c(2,2,2,2),more=FALSE,newpage=FALSE,panel.width=hm.width,
panel.height=hm.height,cex=2)
trellis.focus("legend", side="right", clipp.off=TRUE, highlight=FALSE)
grid.text("PopTree2\nDistance", 0.2, 0, hjust=0.5, vjust=1.2,gp=gpar(cex=0.75))
trellis.unfocus()
dev.off()
Figure 3. Heatmap
Figure 4 is the Poptree2 and Treemix trees next to each other
plotName<-"p4.trees.png"
tm.tree<-"treemix/p4_k100bFLPBrm4"
pt.subtree<-read.tree("poptree/p4.nwk")
rect.start<-0.1
clcolr <- rep("grey27", dim(pt.subtree$edge)[1])
clcolr[c(6,19,20,21,22)]<-all.colors[3]
pt.subtree$node.label<-round(as.numeric(pt.subtree$node.label)*100)
pt.subtree$tip.label[pt.subtree$tip.label=="FLLG"]<-"FLFW"
png(plotName,height=5,width=10,units="in",res=300)
par(mfrow=c(1,2),oma=c(1,0,1,0),mar=c(1,1,1,0.1))
plot.phylo(pt.subtree,tip.color = pt.colors,#align.tip.label = T,#show.node.label = TRUE,
edge.color = clcolr,edge.width = 2,label.offset = 0.0015,font=1)
nodelabels(pt.subtree$node.label,cex=0.75,font=2,frame="none",adj=c(1,-0.2))
mtext("PopTree2",3)
t3<-plot_tree(tm.tree,"treemix/poporder",plus=0.05,scale=F,mbar=F,arrow=0.1,
tip.order = tip.names,lwd = 2,branch.cols = branch.cols,xlab=F)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 1 FLSG NOT_ROOT NOT_MIG TIP 16 NA NA NA NA
## 2 2 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 51 1 16 8
## 3 3 TXCC NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 4 4 FLFD NOT_ROOT NOT_MIG TIP 256 NA NA NA NA
## 5 15 FLPB NOT_ROOT NOT_MIG TIP 473 NA NA NA NA
## 6 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 2 1 1 304 7
## 7 31 FLLG NOT_ROOT NOT_MIG TIP 412 NA NA NA NA
## 8 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 473 52 12 212 3
## 9 51 FLKB NOT_ROOT NOT_MIG TIP 2 NA NA NA NA
## 10 52 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 2 9 256 3
## 11 75 TXFW NOT_ROOT NOT_MIG TIP 104 NA NA NA NA
## 12 76 <NA> NOT_ROOT NOT_MIG NOT_TIP 304 472 2 303 1
## 13 103 ALFW NOT_ROOT NOT_MIG TIP 472 NA NA NA NA
## 14 104 <NA> NOT_ROOT NOT_MIG NOT_TIP 304 75 1 356 3
## 15 135 FLAB NOT_ROOT NOT_MIG TIP 588 NA NA NA NA
## 16 136 <NA> NOT_ROOT MIG NOT_TIP 32 52 NA NA NA
## 17 171 TXSP NOT_ROOT NOT_MIG TIP 172 NA NA NA NA
## 18 172 <NA> NOT_ROOT NOT_MIG NOT_TIP 356 3 1 171 1
## 19 211 FLHB NOT_ROOT NOT_MIG TIP 212 NA NA NA NA
## 20 212 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 211 1 412 2
## 21 255 FLSI NOT_ROOT NOT_MIG TIP 588 NA NA NA NA
## 22 256 <NA> NOT_ROOT NOT_MIG NOT_TIP 52 4 1 588 2
## 23 303 ALST NOT_ROOT NOT_MIG TIP 76 NA NA NA NA
## 24 304 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 104 4 76 3
## 25 355 TXCB NOT_ROOT NOT_MIG TIP 356 NA NA NA NA
## 26 356 <NA> NOT_ROOT NOT_MIG NOT_TIP 104 172 2 355 1
## 27 411 FLCC NOT_ROOT NOT_MIG TIP 412 NA NA NA NA
## 28 412 <NA> NOT_ROOT NOT_MIG NOT_TIP 212 411 1 31 1
## 29 471 LAFW NOT_ROOT NOT_MIG TIP 472 NA NA NA NA
## 30 472 <NA> NOT_ROOT NOT_MIG NOT_TIP 76 471 1 103 1
## 31 473 <NA> ROOT NOT_MIG NOT_TIP 473 15 1 32 15
## 32 490 <NA> NOT_ROOT MIG NOT_TIP 104 75 NA NA NA
## 33 546 <NA> NOT_ROOT MIG NOT_TIP 412 31 NA NA NA
## 34 588 <NA> NOT_ROOT NOT_MIG NOT_TIP 256 255 1 135 1
## 35 641 <NA> NOT_ROOT MIG NOT_TIP 104 75 NA NA NA
## V11
## 1 FLSG:0.000248367
## 2 (FLKB:0.000474655,(FLSG:0.000248367,((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792):0.000664825):0.00127358
## 3 TXCC:0.000281411
## 4 FLFD:0.000926406
## 5 FLPB:0
## 6 (FLSG:0.000248367,((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792):0.000664825
## 7 FLLG:0.0510187
## 8 (((FLKB:0.000474655,(FLSG:0.000248367,((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792):0.000664825):0.00127358,(FLFD:0.000926406,(FLSI:8.2722e-05,FLAB:0.0146583):0.00107509):0.00123607):0.0203382,(FLHB:0.000682617,(FLCC:0.00110869,FLLG:0.0510187):0.000504332):0.00107155):0
## 9 FLKB:0.000474655
## 10 ((FLKB:0.000474655,(FLSG:0.000248367,((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792):0.000664825):0.00127358,(FLFD:0.000926406,(FLSI:8.2722e-05,FLAB:0.0146583):0.00107509):0.00123607):0.0203382
## 11 TXFW:0.0269586
## 12 ((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786
## 13 ALFW:0.00569026
## 14 (TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066
## 15 FLAB:0.0146583
## 16 <NA>
## 17 TXSP:0.00121161
## 18 (TXCC:0.000281411,TXSP:0.00121161):0.00205677
## 19 FLHB:0.000682617
## 20 (FLHB:0.000682617,(FLCC:0.00110869,FLLG:0.0510187):0.000504332):0.00107155
## 21 FLSI:8.2722e-05
## 22 (FLFD:0.000926406,(FLSI:8.2722e-05,FLAB:0.0146583):0.00107509):0.00123607
## 23 ALST:0
## 24 ((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792
## 25 TXCB:0.00319305
## 26 ((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619
## 27 FLCC:0.00110869
## 28 (FLCC:0.00110869,FLLG:0.0510187):0.000504332
## 29 LAFW:0.0118029
## 30 (LAFW:0.0118029,ALFW:0.00569026):0.0176978
## 31 (FLPB:0,(((FLKB:0.000474655,(FLSG:0.000248367,((TXFW:0.0269586,((TXCC:0.000281411,TXSP:0.00121161):0.00205677,TXCB:0.00319305):0.00396619):0.00747066,((LAFW:0.0118029,ALFW:0.00569026):0.0176978,ALST:0):0.00502786):0.0120792):0.000664825):0.00127358,(FLFD:0.000926406,(FLSI:8.2722e-05,FLAB:0.0146583):0.00107509):0.00123607):0.0203382,(FLHB:0.000682617,(FLCC:0.00110869,FLLG:0.0510187):0.000504332):0.00107155):0);
## 32 <NA>
## 33 <NA>
## 34 (FLSI:8.2722e-05,FLAB:0.0146583):0.00107509
## 35 <NA>
## x y ymin ymax
## 1 0.022525012 0.84375 0.8125 0.8750
## 2 0.021611820 0.87500 0.3750 0.9375
## 3 0.046415117 0.71875 0.6875 0.7500
## 4 0.022500716 0.34375 0.3125 0.3750
## 5 0.000000000 0.96875 0.9375 1.0000
## 6 0.022276645 0.81250 0.3750 0.8750
## 7 0.052594582 0.03125 0.0000 0.0625
## 8 0.000000000 0.18750 0.0000 0.9375
## 9 0.022086475 0.90625 0.8750 0.9375
## 10 0.020338240 0.37500 0.1875 0.9375
## 11 0.067069276 0.78125 0.7500 0.8125
## 12 0.037667946 0.43750 0.3750 0.5625
## 13 0.053147151 0.46875 0.4375 0.5000
## 14 0.040110746 0.75000 0.5625 0.8125
## 15 0.029966867 0.21875 0.1875 0.2500
## 16 0.011480800 NA 0.0000 0.1875
## 17 0.047345316 0.65625 0.6250 0.6875
## 18 0.046133706 0.68750 0.6250 0.7500
## 19 0.001754167 0.15625 0.1250 0.1875
## 20 0.001071550 0.12500 0.0000 0.1875
## 21 0.022732122 0.28125 0.2500 0.3125
## 22 0.021574310 0.31250 0.1875 0.3750
## 23 0.037667946 0.40625 0.3750 0.4375
## 24 0.032640086 0.56250 0.3750 0.8125
## 25 0.046647245 0.59375 0.5625 0.6250
## 26 0.044076936 0.62500 0.5625 0.7500
## 27 0.002684572 0.09375 0.0625 0.1250
## 28 0.001575882 0.06250 0.0000 0.1250
## 29 0.059259791 0.53125 0.5000 0.5625
## 30 0.047456891 0.50000 0.4375 0.5625
## 31 0.000000000 0.93750 0.0000 1.0000
## 32 0.064357446 NA 0.5625 0.7500
## 33 0.022581282 NA 0.0000 0.0625
## 34 0.022649400 0.25000 0.1875 0.3125
## 35 0.067069276 NA 0.5625 0.7500
## [1] "0.564492 0 0.02033824"
## [1] "0.899407 0.0401107462921379 0.0670692762921379"
## [1] "0.41172 0.001575882 0.052594582"
## [1] "0.899407 0.0401107462921379 0.0670692762921379"
## [1] 0.022525012 0.046415117 0.022500716 0.000000000 0.052594582
## [6] 0.022086475 0.067069276 0.053147151 0.029966867 0.047345316
## [11] 0.001754167 0.022732122 0.037667946 0.046647245 0.002684572
## [16] 0.059259791
## [1] 0.003
mtext("Treemix",3)
ybar<-0.01
mcols = rev( heat.colors(150) )
mcols = mcols[50:length(mcols)]
ymi = ybar+rect.start
yma = ybar+0.3
l = 0.2
w = l/100
xma = max(t3$d$x/20)
rect( rep(rect.start, 100), ymi+(0:99)*w, rep(rect.start+xma, 100), ymi+(1:100)*w, col = mcols, border = mcols)
text(rect.start+xma+0.001, ymi, lab = "0", adj = 0)
text(rect.start+xma+0.001, yma, lab = "0.5", adj = 0)
text(rect.start, yma+0.07, lab = "Migration", adj = 0 )
text(rect.start, yma+0.04, lab = "weight", adj = 0 )
dev.off()
## png
## 2
Figure 4. Trees
I need to re-run populations because I don’t seem to have the correct fst files. I can use the whitelist and hopefully include bootstrapping and kernel smoothing:
populations -b 2 -W fwsw_results/subset.whitelist.txt -P fwsw_results/stacks -M fwsw_pops_map.txt --fstats --vcf_haplotypes --genomic --bootstrap -k
And then I will identify the shared outliers among them. But first, let’s define some names.
vcf<-parse.vcf("p4.upd.vcf")
vcf$SNP<-paste(vcf$`#CHROM`,vcf$POS,sep=".")
scaffs<-levels(as.factor(vcf[,1]))
scaffs[1:22]<-lgs
scaff.starts<-tapply(vcf$POS,vcf$`#CHROM`,max)
scaff.starts<-data.frame(rbind(cbind(names(scaff.starts),scaff.starts)),stringsAsFactors = F)
locus.info<-c(colnames(vcf[1:9]),"SNP")
dd.plot.name<-as.character("separate_delta-divergence.png")
dd.name<-as.character("sep.deltadivergence.txt")
sdd.name<-as.character("sep.smoothedDD.out.txt")
afs.plot.name<-as.character("p4.All_AFS.png")
stacks.sig.out<-"p4.stacks.sig.snps.txt"
annotations.name<-"p4.StacksFWSWOutliers_annotatedByGenome.csv"
bf.file<-"bayenv/p4.bf.txt"
fwsw.tx<-read.delim("stacks/p4.fst_TXCC-TXFW.txt")
fwsw.la<-read.delim("stacks/p4.fst_ALST-LAFW.txt")
fwsw.al<-read.delim("stacks/p4.fst_ALFW-ALST.txt")
fwsw.fl<-read.delim("stacks/p4.fst_FLCC-FLLG.txt")
fwsw<-read.delim("stacks/fw-sw_populations/batch_2.fst_marine-freshwater.tsv")
#Compare neighboring pops.
tx.sig<-fwsw.tx[fwsw.tx$Fisher.s.P<0.01,"Locus.ID"]
la.sig<-fwsw.la[fwsw.la$Fisher.s.P<0.01,"Locus.ID"]
al.sig<-fwsw.al[fwsw.al$Fisher.s.P<0.01,"Locus.ID"]
fl.sig<-fwsw.fl[fwsw.fl$Fisher.s.P<0.01,"Locus.ID"]
length(tx.sig[(tx.sig %in% c(la.sig,al.sig,fl.sig))])
length(la.sig[(la.sig %in% c(tx.sig,al.sig,fl.sig))])
length(al.sig[(al.sig %in% c(la.sig,tx.sig,fl.sig))])
all.shared<-fl.sig[fl.sig %in% la.sig & fl.sig %in% al.sig & fl.sig %in% tx.sig]
fw.shared.chr<-fwsw.tx[fwsw.tx$Locus.ID %in% all.shared,c("Locus.ID","Chr","BP","Column","Overall.Pi")]
tapply(fw.shared.chr$Locus.ID,factor(fw.shared.chr$Chr),function(x){ length(unique(x)) })
#are they using the same SNPs or different SNPs?
stacks.sig<-data.frame(nrow=nrow(fw.shared.chr),ncol=4)
for(i in 1:nrow(fw.shared.chr)){
tx.bp<-fwsw.tx[fwsw.tx$Fisher.s.P<0.01 & fwsw.tx$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
la.bp<-fwsw.la[fwsw.la$Fisher.s.P<0.01 & fwsw.la$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
al.bp<-fwsw.al[fwsw.al$Fisher.s.P<0.01 & fwsw.al$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
fl.bp<-fwsw.fl[fwsw.fl$Fisher.s.P<0.01 & fwsw.fl$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
stacks.sig[i,1]<-paste(tx.bp,sep=",",collapse = ",")
stacks.sig[i,2]<-paste(la.bp,sep=",",collapse = ",")
stacks.sig[i,3]<-paste(al.bp,sep=",",collapse = ",")
stacks.sig[i,4]<-paste(fl.bp,sep=",",collapse = ",")
}
colnames(stacks.sig)<-c("TX","LA","AL","FL")
stacks.sig<-data.frame(cbind(fw.shared.chr,stacks.sig))
stacks.sig$SNP<-paste(stacks.sig$Chr,stacks.sig$BP,sep=".")
write.table(stacks.sig,stacks.sig.out,sep='\t',row.names=FALSE,col.names=TRUE)
#+ annotations, eval=FALSE
gff<-read.delim(gzfile("../../scovelli_genome/ssc_2016_12_20_chromlevel.gff.gz"),header=F)
colnames(gff)<-c("seqname","source","feature","start","end","score","strand","frame","attribute")
genome.blast<-read.csv("../../scovelli_genome/ssc_2016_12_20_cds_nr_blast_results.csv",skip=1,header=T)#I saved it as a csv
#' extract the significant regions from the gff file
fw.sig.reg<-do.call(rbind,apply(fw.shared.chr,1,function(sig){
this.gff<-gff[as.character(gff$seqname) %in% as.character(unlist(sig["Chr"])),]
this.reg<-this.gff[this.gff$start <= as.numeric(sig["BP"]) & this.gff$end >= as.numeric(sig["BP"]),]
if(nrow(this.reg) == 0){
if(as.numeric(sig["BP"])>max(as.numeric(this.gff$end))){
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region="beyond.last.contig", description=NA,SSCID=NA)
}else{
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region=NA,description=NA,SSCID=NA)
}
}else{
if(length(grep("SSCG\\d+",this.reg$attribute))>0){
geneID<-unique(gsub(".*(SSCG\\d+).*","\\1",this.reg$attribute[grep("SSCG\\d+",this.reg$attribute)]))
gene<-genome.blast[genome.blast$sscv4_gene_ID %in% geneID,"blastp_hit_description"]
}else{
geneID<-NA
gene<-NA
}
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region=paste(this.reg$feature,sep=",",collapse = ","),description=gene,SSCID=geneID)
}
return(as.data.frame(new))
}))
write.csv(fw.sig.reg,annotations.name)
fw.sig.reg<-read.csv(annotations.name)
There are 53 shared SNPs using the Fisher’s P as a cutoff.
The bayes factors and Fsts are all in the following chunk.
fwsw<-read.delim("stacks/fw-sw_populations/batch_2.fst_marine-freshwater.tsv")
#Compare neighboring pops.
tx.sig<-fwsw.tx[fwsw.tx$Fisher.s.P<0.01,"Locus.ID"]
la.sig<-fwsw.la[fwsw.la$Fisher.s.P<0.01,"Locus.ID"]
al.sig<-fwsw.al[fwsw.al$Fisher.s.P<0.01,"Locus.ID"]
fl.sig<-fwsw.fl[fwsw.fl$Fisher.s.P<0.01,"Locus.ID"]
length(tx.sig[(tx.sig %in% c(la.sig,al.sig,fl.sig))])
## [1] 962
length(la.sig[(la.sig %in% c(tx.sig,al.sig,fl.sig))])
## [1] 579
length(al.sig[(al.sig %in% c(la.sig,tx.sig,fl.sig))])
## [1] 679
all.shared<-fl.sig[fl.sig %in% la.sig & fl.sig %in% al.sig & fl.sig %in% tx.sig]
bf<-read.delim(bf.file)
bf.co<-apply(bf[,5:7],2,quantile,0.99) #focus on Bayes Factors, because of Lotterhos & Whitlock (2015)
sal.bf.sig<-bf[bf$Salinity_BF>bf.co["Salinity_BF"],c(1,2,4,8,9,6)]
stacks.sig<-read.delim(stacks.sig.out)
Figure 5 includes Fst data, Bayes Factors data, colors from Structure, and reciprocal monophyly data, plus smoothed Fst values (which I think I will leave out this round).
Now I’m ready to plot Figure 5.
fig5.name<-"p4_stacks_fsts_fwsw_bf.png"
addLines<-FALSE
addSmooth<-TRUE
#' Set up the plotting utilities
all.chr<-data.frame(Chr=c(as.character(fwsw.tx$Chr),as.character(fwsw.la$Chr),
as.character(fwsw.al$Chr),as.character(fwsw.fl$Chr),
as.character(bf$scaffold)),
BP=c(fwsw.tx$BP,fwsw.la$BP,fwsw.al$BP,fwsw.fl$BP,bf$BP),stringsAsFactors = F)
bounds<-tapply(as.numeric(as.character(all.chr$BP)), all.chr$Chr,max)
bounds<-data.frame(Chrom=dimnames(bounds),End=bounds)
colnames(bounds)<-c("Chrom","End")
plot.scaffs<-scaffs[scaffs %in% bounds$Chr]
plot.scaffs[1:22]<-lgs
bounds<-bounds[match(plot.scaffs,bounds$Chrom),]
#generate info
fwsw.plot<-assign.plotpos(fwsw,plot.scaffs,bounds,df.bp="BP",df.chrom = "Chr")
addLines<-TRUE
addLines<-FALSE
addSmooth<-TRUE
#' plot with the outlier regions
#' Does NOT include monophyletic neighborjoining trees.
png(fig5.name,height=6,width=8,units="in",res=300)
par(mfrow=c(5,1),mar=c(0.85,2,0,0.5),oma=c(1,1,1,0.5))
#par(fig=c(0,1,0.9-0.9/5,0.9))
fwswt.fst<-fst.plot(fwsw.tx,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,pch=19,
pt.cols = c(grp.colors[1],grp.colors[2]),pt.cex=1,axis.size = 1)
if(addLines==TRUE){ perlg.add.lines(fwsw.plot,lgs) }
if(addSmooth==TRUE){ points(fwswt.fst$plot.pos,fwswt.fst$Smoothed.Fst,col="cornflowerblue",type="l") }
#points(fwswt.fst$BP,fwswt.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[1])
#points(fwswt.fst$plot.pos[fwswt.fst$Locus.ID %in% all.shared],fwswt.fst$Corrected.AMOVA.Fst[fwswt.fst$Locus.ID %in% all.shared],
# pch=1,col="black",cex=1.3)
clip(0,max(fwswt.fst$plot.pos),0,1)
abline(v=fwswt.fst$plot.pos[fwswt.fst$Locus.ID %in% all.shared],col="gray47")
mtext("TXFW vs. TXCC",2,cex=0.75)#,line=-1)
labs<-tapply(fwswt.fst$plot.pos,fwswt.fst$Chr,median)
text(x=labs[lgs],y=-0.1,labels=lgn,xpd=TRUE)
fwswl.fst<-fst.plot(fwsw.la,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,pch=19,
pt.cols=c(grp.colors[2],grp.colors[3]),pt.cex=1,axis.size=1)
if(addLines==TRUE){ perlg.add.lines(fwsw.plot,lgs) }
if(addSmooth==TRUE){ points(fwswl.fst$plot.pos,fwswl.fst$Smoothed.Fst,col="cornflowerblue",type="l") }
#points(fwswl.fst$BP,fwswl.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[3])
#points(fwswl.fst$plot.pos[fwswl.fst$Locus.ID %in% all.shared],fwswl.fst$Corrected.AMOVA.Fst[fwswl.fst$Locus.ID %in% all.shared],
# pch=1,col="black",cex=1.3)
clip(0,max(fwswl.fst$plot.pos),0,1)
abline(v=fwswl.fst$plot.pos[fwswl.fst$Locus.ID %in% all.shared],col="gray47")
mtext("LAFW vs. ALST",2,cex=0.75)#,line=-1)
labs<-tapply(fwswl.fst$plot.pos,fwswl.fst$Chr,median)
text(x=labs[lgs],y=-0.1,labels=lgn,xpd=TRUE)
fwswa.fst<-fst.plot(fwsw.al,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,pch=19,
pt.cols=c(grp.colors[3],grp.colors[2]),pt.cex=1,axis.size = 1)
if(addLines==TRUE){ perlg.add.lines(fwsw.plot,lgs) }
if(addSmooth==TRUE){ points(fwswa.fst$plot.pos,fwswa.fst$Smoothed.Fst,col="cornflowerblue",type="l") }
#points(fwswa.fst$BP,fwswa.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[4])
#points(fwswa.fst$plot.pos[fwswa.fst$Locus.ID %in% all.shared],fwswa.fst$Corrected.AMOVA.Fst[fwswa.fst$Locus.ID %in% all.shared],
# pch=1,col="black",cex=1.3)
clip(0,max(fwswa.fst$plot.pos),0,1)
abline(v=fwswa.fst$plot.pos[fwswa.fst$Locus.ID %in% all.shared],col="gray47")
mtext("ALFW vs. ALST",2,cex=0.75)#,line=-1)
labs<-tapply(fwswa.fst$plot.pos,fwswa.fst$Chr,median)
text(x=labs[lgs],y=-0.1,labels=lgn,xpd=TRUE)
fwswf.fst<-fst.plot(fwsw.fl,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,pch=19,
pt.cols=c(grp.colors[6],grp.colors[5]),pt.cex=1,axis.size=1)
if(addLines==TRUE){ perlg.add.lines(fwsw.plot,lgs) }
if(addSmooth==TRUE){ points(fwswf.fst$plot.pos,fwswf.fst$Smoothed.Fst,col="cornflowerblue",type="l") }
#points(fwswf.fst$BP,fwswf.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[6])
#points(fwswf.fst$plot.pos[fwswf.fst$Locus.ID %in% all.shared],fwswf.fst$Corrected.AMOVA.Fst[fwswf.fst$Locus.ID %in% all.shared],
# pch=1,col="black",cex=1.3)
clip(0,max(fwswf.fst$plot.pos),0,1)
abline(v=fwswf.fst$plot.pos[fwswf.fst$Locus.ID %in% all.shared],col="gray47")
mtext("FLFW vs. FLCC",2,cex=0.75)#,line=-1)
mtext(expression(bold(italic(F)[ST])),2,outer=T,line=-1,cex=0.75)
labs<-tapply(fwswf.fst$plot.pos,fwswf.fst$Chr,median)
text(x=labs[lgs],y=-0.1,labels=lgn,xpd=TRUE)
#BF
bs.sal<-fst.plot(bf,fst.name="logSal",chrom.name="scaffold",bp.name = "BP",scaffold.widths=bounds,
scaffs.to.plot = plot.scaffs,pch=19,axis.size = 1,pt.cex = 1)
points(bs.sal[bs.sal$SNP %in% sal.bf.sig$SNP,"plot.pos"],
bs.sal[bs.sal$SNP %in% sal.bf.sig$SNP,"logSal"],
col="cornflowerblue",pch=19)#give the outliers a color
clip(0,max(bs.sal$plot.pos),-3,241)
abline(v=fwswf.fst$plot.pos[fwswf.fst$Locus.ID %in% all.shared],col="gray47")
mtext("log(Salinity BF)",2,cex=0.75,line=2.1)
labs<-tapply(bs.sal$plot.pos,bs.sal$scaffold,median)
text(x=labs[lgs],y=-5,labels=lgn,xpd=TRUE)
#points(bs.sal[bs.sal$SNP %in% stacks.sig$SNP,"plot.pos"],
# bs.sal[bs.sal$SNP %in% stacks.sig$SNP,"logSal"],
# col="cornflowerblue",cex=1.3)
#clip(min(bs.sal$plot.pos),max(bs.sal$plot.pos),
# min(bs.sal$logSal),max(bs.sal$logSal))
#abline(h=log(bf.co["Salinity_BF"]),col="cornflowerblue",lwd=2)
dev.off()
## png
## 2
Figure 5. Fsts
There seems to be an overabundance of shared outliers on LG4. I can test this using the multinomial distribution.
if(!("stacks.sig" %in% ls())){
stacks.sig<-read.delim("p4.stacks.sig.snps.txt")
}
if(is.null(stacks.sig$SNP)){ #make sure the SNP column is there.
stacks.sig$SNP<-paste(stacks.sig$Chr,as.numeric(stacks.sig$BP)+1,sep=".")
}
sig.chrom<-stacks.sig[stacks.sig$Chr %in% lgs,]
sig.chrom$Chr<-factor(sig.chrom$Chr)
nloci<-nrow(sig.chrom)
nchrom<-length(lgs)
exp.perchrom<-rep(nloci/length(lgs),length(lgs))
names(exp.perchrom)<-lgs
exp.perchrom
## LG1 LG2 LG3 LG4 LG5 LG6 LG7 LG8
## 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364
## LG9 LG10 LG11 LG12 LG13 LG14 LG15 LG16
## 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364
## LG17 LG18 LG19 LG20 LG21 LG22
## 2.136364 2.136364 2.136364 2.136364 2.136364 2.136364
obs.perchrom<-tapply(sig.chrom$Locus.ID,sig.chrom$Chr,length)[lgs]
names(obs.perchrom)<-lgs
obs.perchrom[is.na(obs.perchrom)]<-0
obs.perchrom
## LG1 LG2 LG3 LG4 LG5 LG6 LG7 LG8 LG9 LG10 LG11 LG12 LG13 LG14 LG15
## 5 4 1 14 0 3 0 0 1 1 0 5 1 6 0
## LG16 LG17 LG18 LG19 LG20 LG21 LG22
## 0 0 2 1 3 0 0
dmultinom(x = obs.perchrom,prob = exp.perchrom)
## [1] 1.333996e-25
I can also look for overlapping loci in pairwise saltwater-saltwater comparisons.
swsw.name<-"p4_stacks_swsw.png"
swsw.tx<-read.delim("stacks/p4.fst_TXCB-TXCC.txt")
swsw.al<-read.delim("stacks/p4.fst_ALST-FLSG.txt")
swsw.fl<-read.delim("stacks/p4.fst_FLCC-FLHB.txt")
###### SW-SW neighbors ######
tx.sw.sig<-swsw.tx[swsw.tx$Fisher.s.P<0.01,"Locus.ID"]
al.sw.sig<-swsw.al[swsw.al$Fisher.s.P<0.01,"Locus.ID"]
fl.sw.sig<-swsw.fl[swsw.fl$Fisher.s.P<0.01,"Locus.ID"]
length(tx.sw.sig[(tx.sw.sig %in% c(al.sw.sig,fl.sw.sig))])
## [1] 106
length(al.sw.sig[(al.sw.sig %in% c(tx.sw.sig,fl.sw.sig))])
## [1] 135
length(fl.sw.sig[(fl.sw.sig %in% c(tx.sw.sig,al.sw.sig))])
## [1] 35
sw.shared<-fl.sw.sig[fl.sw.sig %in% al.sw.sig & fl.sw.sig %in% tx.sw.sig]
png(swsw.name,height=6,width=7.5,units="in",res=300)
par(mfrow=c(3,1),mar=c(0.85,2,0,0.5),oma=c(1,1,1,0.5))
swswt.fst<-fst.plot(swsw.tx,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,axis.size = 1,
pt.cols=c(grp.colors[1],grp.colors[2]),pt.cex = 1,pch=19)
mtext("TXCC vs. TXCB",3,line=-1,cex=0.75)
swswa.fst<-fst.plot(swsw.al,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr", y.lim=c(0,1),
scaffs.to.plot=plot.scaffs, scaffold.widths = bounds,axis.size = 1,
pt.cols=c(grp.colors[2],grp.colors[3]),pt.cex = 1,pch=19)
#points(swswa.fst$plot.pos,swswa.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[3])
mtext("ALST vs. FLSG",3,line=-1,cex=0.75)
swswf.fst<-fst.plot(swsw.fl,fst.name = "Corrected.AMOVA.Fst", bp.name = "BP",chrom.name = "Chr",
scaffs.to.plot=plot.scaffs, y.lim = c(0,1),scaffold.widths = bounds,axis.size = 1,
pt.cols=c(grp.colors[6],grp.colors[5]),pt.cex = 1,pch=19)
#points(swswf.fst$plot.pos,swswf.fst$Corrected.AMOVA.Fst,pch=21,bg=grp.colors[6])
mtext("FLCC vs. FLHB",3,line=-1,cex=0.75)
mtext(expression(italic(F)[ST]),2,outer=T,line=-0.75,cex=0.75)
last<-0
for(i in 1:length(lgs)){
text(x=mean(swswf.fst[swswf.fst$Chr ==lgs[i],"plot.pos"]),y=-0.07,
labels=lgn[i], adj=1, xpd=TRUE)
last<-max(swswf.fst[swswf.fst$Chr ==lgs[i],"plot.pos"])
}
dev.off()
## png
## 2
SWSW Fsts
In Figure 6, we show different measures of diversity: nucleotide diversity (\(\pi\)), heterozygosity (H), Jost’s D, and \(\delta\)-divergence. Additionally, we include the shared Fst outliers and putative loci, and highlight regions with high \(\pi\) and H.
#' Calculate AFS from vcf
fw.afs<-lapply(fw.list,function(pop){
this.vcf<-cbind(vcf[,locus.info],vcf[,grep(pop,colnames(vcf))])
this.afs<-do.call(rbind,apply(this.vcf,1,calc.afs.vcf))
})
names(fw.afs)<-c("TXFW","LAFW","ALFW","FLFW")
sw.afs<-lapply(sw.list,function(pop){
this.vcf<-cbind(vcf[,locus.info],vcf[,grep(pop,colnames(vcf))])
this.afs<-do.call(rbind,apply(this.vcf,1,calc.afs.vcf))
})
names(sw.afs)<-sw.list
all.afs<-c(fw.afs,sw.afs)
Figure S1. Fsts
pi.file.name<-"p4.pi.txt"
avgpi.file.name<-"p4.avgpi.txt"
all.pi<-data.frame(Chrom=vcf$`#CHROM`,Pos=vcf$POS,Pi=unlist(apply(vcf,1,calc.pi)))
all.pi$SNP<-paste(all.pi$Chrom,as.numeric(as.character(all.pi$Pos)),sep=".")
write.table(all.pi,pi.file.name,col.names = TRUE,row.names=FALSE, quote=FALSE,sep='\t')
all.pi<-read.table(pi.file.name,header=T)
avg.pi.adj<-read.table(avgpi.file.name,header=T)
all.het.name<-"p4.avg.het.txt"
avg.het.adj.name<-"p4.avg.het.adj.txt"
#het
avg.het<-do.call("rbind",sliding.window(vcf,lgs,stat="het",width=10))
avg.het.adj<-fst.plot(avg.het,scaffold.widths=scaff.starts,pch=19,
fst.name = "Avg.Stat",chrom.name = "Chr",bp.name = "Avg.Pos")
all.het<-data.frame(Chrom=vcf$`#CHROM`,Pos=vcf$POS,Het=unlist(apply(vcf,1,calc.het)))
all.het$SNP<-paste(all.het$Chrom,as.numeric(as.character(all.het$Pos)),sep=".")
write.table(avg.het.adj,avg.het.adj.name,sep='\t',col.names=TRUE)
write.table(all.het,all.het.name,sep='\t',col.names=TRUE)
avg.het.adj<-read.delim(avg.het.adj.name,header=TRUE)
all.het<-read.delim(all.het.name,header=TRUE)
swfw.mu<-calc.mean.fst(vcf = vcf,pop.list1 = sw.list,pop.list2 = fw.list,maf.cutoff=0.01)
fwfw.mu<-calc.mean.fst(vcf = vcf,pop.list1 = fw.list,pop.list2 = fw.list, maf.cutoff=0.01)
deltad<-merge(swfw.mu,fwfw.mu,by="SNP")
deltad<-deltad[,c("SNP","Chrom.x","Pos.x","Mean.Fst.x","Mean.Fst.y")]
colnames(deltad)<-c("SNP","Chrom","Pos","MeanSWFW.Fst","MeanFWFW.Fst")
deltad$deltad<-deltad$MeanSWFW.Fst - deltad$MeanFWFW.Fst
deltad<-deltad[!is.na(deltad$deltad),]#remove NAs
write.table(deltad, dd.name,sep='\t',col.names = TRUE,row.names = FALSE)
#' ```
This plotting function also generates the smoothed \(\delta\)-divergence values (which are not saved) and identifies the outliers (in 99% and 1% quantiles, and which are saved in sdd.out).
First, I must calculate Jost’s D on each locus in the p4 dataset
sub.genind<-read.structure("stacks/subset.structure.stru",n.ind=698,
n.loc=9638,col.lab=1,col.pop=2,sep='\t',
row.marknames = 2,onerowperind=FALSE,ask=FALSE)
sub.genind@pop<-factor(gsub("sample_(\\w{4}).*","\\1",rownames(sub.genind@tab)))
jostd<-D_Jost(sub.genind)
write.table(jostd$per.locus,"p4.jostd.perlocus.txt",sep='\t',col.names=FALSE,row.names = TRUE,quote=F)
Now this is done, so I don’t need to evaluate it again – I’ll just need to read it in from a file when I want to use it.
jostd<-read.delim(jostd.name,header=F)
colnames(jostd)<-c("locid","D")
jostd$SNP<-vcf$SNP
jostd$POS<-vcf$POS
jostd$Chr<-vcf$`#CHROM`
jostd$ID<-vcf$ID
smoothed.name<-"p4_deltad_pi_het.png"
#assign the plotting positions relative to all loci
stacks.sig<-assign.plotpos(stacks.sig,plot.scaffs = plot.scaffs,bounds=bounds,df.chrom="Chr",df.bp="BP")
dd<-assign.plotpos(deltad,plot.scaffs = plot.scaffs,bounds=bounds,df.chrom="Chrom",df.bp="Pos")
pi<-assign.plotpos(all.pi,plot.scaffs = plot.scaffs,bounds=bounds,df.chrom="Chrom",df.bp="Pos")
ht<-assign.plotpos(all.het,plot.scaffs = plot.scaffs,bounds=bounds,df.chrom="Chrom",df.bp="Pos")
jd<-assign.plotpos(jostd,plot.scaffs = plot.scaffs,bounds=bounds,df.chrom="Chr",df.bp="POS")
colnames(jd)<-c("locid","D","SNP","Pos","Chrom","ID","plot.pos") #standardize the names
smooth.per.chrom<-function(df,lgs,df.chrom,df.bp,df.stat,out.name,...){#span=0.1,degree=2
smooth.all<-data.frame(stringsAsFactors = FALSE) #all smoothed values
smooth.low<-data.frame(stringsAsFactors = FALSE) #lower outliers
smooth.upp<-data.frame(stringsAsFactors = FALSE) #upper outliers
for(i in 1:length(lgs)){
this.chrom<-df[df[,df.chrom] %in% lgs[i],]
this.smooth<-loess.smooth(this.chrom[,df.bp],this.chrom[,df.stat],...)
this.smooth<-data.frame(chr=cbind(rep(as.character(lgs[i]),length(this.smooth$x)),
pos=as.numeric(this.smooth$x),
smoothed.stats=as.numeric(this.smooth$y)),stringsAsFactors = FALSE)
this.upp<-this.chrom[this.chrom[df.stat]>=quantile(this.chrom[,df.stat],0.99),]#choosing the upper outliers
this.low<-this.chrom[this.chrom[,df.stat]<=quantile(this.chrom[,df.stat],0.01),]#choosing the lower outliers
# save the data
smooth.all<-data.frame(rbind(smooth.all,this.smooth))
smooth.low<-rbind(smooth.low,this.low)
smooth.upp<-rbind(smooth.upp,this.upp)
}
smooth.upp$direction<-"upper"
smooth.low$direction<-"lower"
smooth.out<-rbind(smooth.low,smooth.upp) # put the outliers in one df
colnames(smooth.all)<-c("chr","pos","smoothed.stats")
smooth.all$pos<-as.numeric(as.character(smooth.all$pos))
smooth.all$smoothed.stats<-as.numeric(as.character(smooth.all$smoothed.stats))
# save to files
write.table(smooth.out,paste(out.name,"outliers.txt",sep=""),col.names=T,row.names=F,quote=F,sep='\t')
write.table(smooth.all,paste(out.name,"smoothed.txt",sep=""),col.names=T,row.names=F,quote=F,sep='\t')
smoothed<-list(smooth.out,smooth.all)
}
#smooth the statistics
dd.bp.smooth<-smooth.per.chrom(dd,lgs,df.chrom="Chrom",df.bp="Pos",df.stat="deltad",out.name="dd.bp",span=.1,degree=2)
pi.bp.smooth<-smooth.per.chrom(pi,lgs,df.chrom="Chrom",df.bp="Pos",df.stat="Pi",out.name="pi.bp",span=0.1,degree=2)
ht.bp.smooth<-smooth.per.chrom(ht,lgs,df.chrom="Chrom",df.bp="Pos",df.stat="Het",out.name="ht.bp",span=0.1,degree=2)
jd.bp.smooth<-smooth.per.chrom(jd,lgs,df.chrom="Chrom",df.bp="Pos",df.stat="D",out.name="jd.bp",span=0.1,degree=2)
#smooth the statistics
dd.pp.smooth<-smooth.per.chrom(dd,lgs,df.chrom="Chrom",df.bp="plot.pos",df.stat="deltad",out.name="dd",span=0.1,degree=2)
pi.pp.smooth<-smooth.per.chrom(pi,lgs,df.chrom="Chrom",df.bp="plot.pos",df.stat="Pi",out.name="pi",span=0.1,degree=2)
ht.pp.smooth<-smooth.per.chrom(ht,lgs,df.chrom="Chrom",df.bp="plot.pos",df.stat="Het",out.name="ht",span=0.1,degree=2)
jd.pp.smooth<-smooth.per.chrom(jd,lgs,df.chrom="Chrom",df.bp="plot.pos",df.stat="D",out.name="jd",span=0.1,degree=2)
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
plot.smooth.stats<-function(df,smooth,stat.name,color,ylabel,...){
dd<-fst.plot(fst.dat = df,...)
mtext(ylabel,2,line=1.5,cex=0.75)
points(x=as.numeric(as.character(smooth[[2]]$pos)),
y=as.numeric(as.character(smooth[[2]]$smoothed.stats)),col=color,type="l",lwd=2)
points(x=as.numeric(smooth[[2]]$pos),y=as.numeric(smooth[[2]]$smoothed.stats),col=color,type="l",lwd=2)
points(smooth[[1]]$plot.pos[smooth[[1]]$direction=="upper"],smooth[[1]][smooth[[1]]$direction=="upper",stat.name],
pch=24,bg=color,col=color,cex=0.5) #add outliers
points(smooth[[1]]$plot.pos[smooth[[1]]$direction=="lower"],smooth[[1]][smooth[[1]]$direction=="lower",stat.name],
pch=25,bg=color,col=color,cex=0.5)
}
xlab.lgs<-function(df,lgs,lgn,chr,bp,...){
last<-0
for(i in 1:length(lgs)){
text(x=median(df[df[,chr] ==lgs[i],bp]),labels=lgn[i],...)
last<-max(df[df[,chr] ==lgs[i],bp])
}
}
png(smoothed.name,height=7,width=5,units="in",res=300)
par(mfrow=c(4,1),mar=c(1,1,1,1),oma=c(1,2,1,1),cex=0.75)
t<-plot.smooth.stats(df=dd,smooth=dd.pp.smooth,stat.name="deltad",
color=comp.col["deltad"],fst.name="deltad",bp.name="Pos",
ylabel=expression(paste(delta,"-divergence")),y.lim=c(-0.5,1),
axis=0.75,scaffold.widths=scaff.starts,pch=19,scaffs.to.plot = scaffs)
xlab.lgs(dd,lgs,lgn,"Chrom","plot.pos",adj=1, xpd=TRUE,y=-0.5,cex=0.75)
#plot Jost's D
d<-plot.smooth.stats(df=jd,smooth=jd.pp.smooth,stat.name="D",
color=comp.col["D"],fst.name="D",chrom.name = "Chrom",bp.name="Pos",
ylabel=expression("Jost's"~italic(D)),
axis=0.75,scaffold.widths=scaff.starts,pch=19,scaffs.to.plot = scaffs)
xlab.lgs(jd,lgs,lgn,"Chrom","plot.pos",adj=1, xpd=TRUE,y=-0.075,cex=0.75)
#plot pi
p<-plot.smooth.stats(df=pi,smooth=pi.pp.smooth,stat.name="Pi",
color=comp.col["pi"],fst.name="Pi",chrom.name = "Chrom",bp.name="Pos",
ylabel=expression(pi),y.lim=c(0,0.5),
axis=0.75,scaffold.widths=scaff.starts,pch=19,scaffs.to.plot = scaffs)
xlab.lgs(pi,lgs,lgn,"Chrom","plot.pos",adj=1, xpd=TRUE,y=-0.05,cex=0.75)
#abline(v=stacks.sig$plot.pos,col="grey10") #stacks
#plot heterozygosity
h<-plot.smooth.stats(df=ht,smooth=ht.pp.smooth,stat.name="Het",
color=comp.col["Het"],fst.name="Het",chrom.name = "Chrom",bp.name="Pos",
ylabel=expression(italic(H)),
axis=0.75,scaffold.widths=scaff.starts,pch=19,scaffs.to.plot = scaffs)
xlab.lgs(ht,lgs,lgn,"Chrom","plot.pos",adj=1, xpd=TRUE,y=-0.05,cex=0.75)
dev.off()
Figure x. Smoothed variables
bf$SNP<-paste(bf$scaffold,as.numeric(as.character(bf$BP))+1,sep=".")
bf.co<-apply(bf[,5:7],2,quantile,0.99) #focus on Bayes Factors, because of Lotterhos & Whitlock (2015)
sal.bf.sig<-bf[bf$Salinity_BF>bf.co["Salinity_BF"],c(1,2,4,8,6,9)]
all.out<-data.frame(rbind(cbind(paste(fw.sig.reg$Chr,fw.sig.reg$BP,sep="."),"fst"),
cbind(dd.bp.smooth[[1]]$SNP,"deltad"),
cbind(ht.bp.smooth[[1]]$SNP,"het"),
cbind(jd.bp.smooth[[1]]$SNP,"jostd"),
cbind(pi.bp.smooth[[1]]$SNP,"pi"),
cbind(sal.bf.sig$SNP,"bayenv_salinity")),stringsAsFactors = FALSE)
colnames(all.out)<-c("SNP","test")
write.table(all.out,"all_outliers.txt",sep='\t',quote=FALSE,col.names=TRUE,row.names=FALSE)
Here, I compare the Fst outliers and \(\delta\)-divergence outliers to putative genes identified from other studies of teleosts adapting to freshwater environments.
#putative genes
put.genes<-read.delim("putative_genes.txt",header=TRUE,sep='\t')
#ld info
ld<-read.delim("p4.ld_info_fwsw.txt")
gff.name<-"ssc_2016_12_20_chromlevel.gff.gz"
#genome annotations
if(length(grep("gz",gff.name))>0){
gff<-read.delim(gzfile(paste("../../scovelli_genome/",gff.name,sep="")),header=F)
} else{
gff<-read.delim(paste("../../scovelli_genome/",gff.name,sep=""),header=F)
}
colnames(gff)<-c("seqname","source","feature","start","end","score","strand","frame","attribute")
genome.blast<-read.csv("../../scovelli_genome/ssc_2016_12_20_cds_nr_blast_results.csv",skip=1,header=T)#I saved it as a csv
These are updated from fwsw_analysis.R and are specific to these analyses, and are not generalizable!
ssc.fwsw.fstsig<-function(tx, la, al, fl, cutoff)
{
tx.sig<-tx[tx$Fisher.s.P<cutoff,"Locus.ID"]
la.sig<-la[la$Fisher.s.P<cutoff,"Locus.ID"]
al.sig<-al[al$Fisher.s.P<cutoff,"Locus.ID"]
fl.sig<-fl[fl$Fisher.s.P<cutoff,"Locus.ID"]
length(tx.sig[(tx.sig %in% c(la.sig,al.sig,fl.sig))])
length(la.sig[(la.sig %in% c(tx.sig,al.sig,fl.sig))])
length(al.sig[(al.sig %in% c(la.sig,tx.sig,fl.sig))])
all.shared<-fl.sig[fl.sig %in% la.sig & fl.sig %in% al.sig & fl.sig %in% tx.sig]
fw.shared.chr<-fwsw.tx[fwsw.tx$Locus.ID %in% all.shared,c("Locus.ID","Chr","BP","Column","Overall.Pi")]
tapply(fw.shared.chr$Locus.ID,factor(fw.shared.chr$Chr),function(x){ length(unique(x)) })
#are they using the same SNPs or different SNPs?
stacks.sig<-data.frame(nrow=nrow(fw.shared.chr),ncol=4)
for(i in 1:nrow(fw.shared.chr)){
tx.bp<-fwsw.tx[tx$Fisher.s.P<cutoff & tx$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
la.bp<-fwsw.la[la$Fisher.s.P<cutoff & la$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
al.bp<-fwsw.al[al$Fisher.s.P<cutoff & al$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
fl.bp<-fwsw.fl[fl$Fisher.s.P<cutoff & fl$Locus.ID == fw.shared.chr[i,"Locus.ID"],"BP"]
stacks.sig[i,1]<-paste(tx.bp,sep=",",collapse = ",")
stacks.sig[i,2]<-paste(la.bp,sep=",",collapse = ",")
stacks.sig[i,3]<-paste(al.bp,sep=",",collapse = ",")
stacks.sig[i,4]<-paste(fl.bp,sep=",",collapse = ",")
}
colnames(stacks.sig)<-c("TX","LA","AL","FL")
stacks.sig<-data.frame(cbind(fw.shared.chr,stacks.sig))
stacks.sig$SNP<-paste(stacks.sig$Chr,stacks.sig$BP,sep=".")
return(stacks.sig)
}
#' extract the significant regions from the gff file
sig.region.ann<-function(fw.shared.chr,gff,genome.blast)
{
fw.sig.reg<-do.call(rbind,apply(fw.shared.chr,1,function(sig){
this.gff<-gff[as.character(gff$seqname) %in% as.character(unlist(sig["Chr"])),]
this.reg<-this.gff[this.gff$start <= as.numeric(sig["BP"]) & this.gff$end >= as.numeric(sig["BP"]),]
if(nrow(this.reg) == 0){
if(as.numeric(sig["BP"])>max(as.numeric(this.gff$end))){
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region="beyond.last.contig", description=NA,SSCID=NA)
}else{
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region=NA,description=NA,SSCID=NA)
}
}else{
if(length(grep("SSCG\\d+",this.reg$attribute))>0){
geneID<-unique(gsub(".*(SSCG\\d+).*","\\1",this.reg$attribute[grep("SSCG\\d+",this.reg$attribute)]))
gene<-genome.blast[genome.blast$sscv4_gene_ID %in% geneID,"blastp_hit_description"]
}else{
geneID<-NA
gene<-NA
}
new<-data.frame(Locus=sig["Locus.ID"],Chr=sig["Chr"],BP=sig["BP"],SNPCol=sig["Column"],
region=paste(this.reg$feature,sep=",",collapse = ","),description=gene,SSCID=geneID)
}
return(as.data.frame(new))
}))
}
#' extract the significant regions from the gff file
put.reg<-do.call(rbind,apply(put.genes,1,function(gene){
ssc.gene<-as.character(gene[[6]])
#there might be multiple matches
ssc.genes<-unlist(strsplit(ssc.gene,","))
#for each gene, match to gff
gene.info<-do.call(rbind,lapply(ssc.genes,function(genename){
this.gff<-gff[grep(genename,gff$attribute),]
chrom<-unique(as.character(this.gff$seqname))
start<-min(this.gff$start)
stop<-max(this.gff$end)
return(c(chrom,start,stop))
}))
if(as.character(gene[[3]]) == "") { gene[[3]]<-NA }
new<-data.frame(Gene=rep(gene[[1]],length(gene.info[,1])),
Function=rep(gene[[2]],length(gene.info[,1])),
Chromsome=rep(gene[[3]],length(gene.info[,1])),
Species=rep(gene[[4]],length(gene.info[,1])),
Citation=rep(gene[[5]],length(gene.info[,1])),
Scovelli_geneID=rep(gene[[6]],length(gene.info[,1])),
Notes=rep(gene[[7]],length(gene.info[,1])),
Chrom=gene.info[,1],StartBP=gene.info[,2],StopBP=gene.info[,3],
stringsAsFactors = FALSE)
return(as.data.frame(new))
}))
write.table(put.reg,"putative.gene.regions.tsv",col.names=T,sep='\t')
Note that some genes will have multiple rows as they match multiple locations in the genome.
put.reg$rad.loci<-apply(put.reg,1,function(gene){
rad<-vcf[vcf$`#CHROM` %in% gene[["Chrom"]] &
vcf$POS >= as.numeric(as.character(gene[["StartBP"]])) &
vcf$POS <= as.numeric(as.character(gene[["StopBP"]])),]
if(nrow(rad)>0){ rad.snps<-paste(rad$SNP,sep=",",collapse=",") }
else { rad.snps<-NA }
return(rad.snps)
})
About 48.1818182% of the putative genes have RAD loci within them - but only about 22.037037% of all of the annotations have RAD loci.
\(D'\) was calculated using a custom C++ program (found at https://github.com/spflanagan/SCA/tree/master/programs/pairwise_ld_vcf) using the formula:
\[D' = \sum_{i=1}^m\sum_{j=1}^n \frac{p_iq_j|D_{ij}|}{D_{max}},\] where \(m\) is the number of alleles at locus and \(n\) is the number of alleles at locus B. \(D_{ij}\) was calculated as \(f_{ij}-p_iq_j\), where \(f_{ij}\) is the frequency of the \(A_iB_j\) haplotype, \(p_i\) is the frequency of allele \(i\), and \(q_j\) is the frequency of allele \(j\). \(D_{max}\) is the lesser of \(p_iq_j\) or \((1-p_i)(1-q_j)\) when \(D_{ij}\) < 0 and is the minimum of (1 - \(p_{i}\))\(q_{i}\) or \(p_i(1-q_i)\) when \(D_{ij}\) > 0.
In the data.frame ld, the locus IDs (LocIDA and LocIDB) are the positions on the chromosome.
ld$pos.diff<-abs(ld$LocIDA - ld$LocIDB)
chrom.ld<-tapply(ld[ld$D.>0.5,"pos.diff"],ld[ld$D. >0.5,"Chrom"],mean)
#' Find outliers in a putative gene region
outlier.in.region<-function(outlier.df, region.df, oChrom="Chrom", oBP="BP",
rChrom="Chrom",rBPStart="StartBP",rBPStop="StopBP"){
out.in<-apply(region.df,1,function(put.gene){
oin<-outlier.df[outlier.df[[oChrom]] %in% put.gene[[rChrom]]
& outlier.df[[oBP]] >= as.numeric(as.character(put.gene[[rBPStart]]))
& outlier.df[[oBP]] <= as.numeric(as.character(put.gene[[rBPStop]])),"SNP"]
if(length(oin)==0){ oin<-NA }
return(paste(oin,sep=",",collapse=","))
})
return(out.in)
}
outlier.nearby<-function(outlier.df,region.df,ld.df,
oChrom="Chrom", oBP="BP",
rChrom="Chrom",rBPStart="StartBP",rBPStop="StopBP"){
out.nearby<-apply(region.df,1,function(put.gene){
nearby<-outlier.df[outlier.df[[oChrom]] %in% put.gene[[rChrom]]
& outlier.df[[oBP]] >= (as.numeric(as.character(put.gene[[rBPStart]])) - (ld.df[put.gene[[rChrom]] ]))
& outlier.df[[oBP]] <= (as.numeric(as.character(put.gene[[rBPStop]])) + (ld.df[put.gene[[rChrom]] ])),"SNP"]
if(length(nearby)==0){ nearby<-NA }
return(paste(nearby,sep=",",collapse=","))
})
}
#' Get the plotting positions for the putative genes
all.chr<-data.frame(Chr=vcf$`#CHROM`,BP=vcf$POS,stringsAsFactors = F)
bounds<-data.frame(levels(as.factor(all.chr$Chr)),tapply(as.numeric(as.character(all.chr$BP)),all.chr$Chr,max))
colnames(bounds)<-c("Chrom","End")
plot.scaffs<-scaffs[scaffs %in% bounds$Chrom]
#convert put.reg to the plotting BP
new.dat<-data.frame(stringsAsFactors = F)
last.max<-0
for(i in 1:length(plot.scaffs)){
#pull out the data for this scaffold
if(nrow(bounds[bounds$Chrom %in% plot.scaffs[i],])>0){ #sanity check
chrom.dat<-put.reg[put.reg$Chrom %in% plot.scaffs[i],]
if(nrow(chrom.dat)>0){
chrom.dat$plot.min<-as.numeric(as.character(chrom.dat$StartBP))+last.max
chrom.dat$plot.max<-as.numeric(as.character(chrom.dat$StopBP))+last.max
new.dat<-rbind(new.dat,chrom.dat)
#last.max<-max(chrom.dat$plot.pos)+
# as.numeric(scaffold.widths[scaffold.widths[,1] %in% scaffs.to.plot[i],2])
}
last.max<-last.max+
as.numeric(bounds[bounds$Chrom %in% plot.scaffs[i],2])
}else{
print(paste(plot.scaffs[i], "has no designated width."))
}
}
#make sure everything is the correct class
new.dat$plot.min<-as.numeric(as.character(new.dat$plot.min))
new.dat$plot.max<-as.numeric(as.character(new.dat$plot.max))
fw.sig.reg<-read.csv("p4.StacksFWSWOutliers_annotatedByGenome.csv")
all.put.ssc<-as.character(put.reg$Scovelli_geneID)
all.put.ssc<-unlist(lapply(all.put.ssc,strsplit,","))
sig.put.match<-all.put.ssc[all.put.ssc %in% fw.sig.reg$SSCID]
#or use the functions
fw.sig.reg$SNP<-paste(fw.sig.reg$Chr,as.character(fw.sig.reg$BP),sep=".")
stacks.in<-outlier.in.region(fw.sig.reg,put.reg,oChrom="Chr")
put.reg$fst.in<-as.character(stacks.in)
Of the 54 shared outlier SNPs, 27 are in coding regions of the genome. However, 2 shared outliers are in putative gene regions, namely APOL, SPG1
No, they don’t overlap. Maybe they’re in the same LD region?
put.nearby.rad<-outlier.nearby(fw.sig.reg,put.reg,chrom.ld,oChrom = "Chr")
#add the nearby significant rad loci to the put.reg data.frame
put.reg$nearby.rad<-put.nearby.rad
head(put.reg[put.reg$nearby.rad!="NA",])
## Gene
## 1 ABCB7
## 2 ABLIM3
## 3 ADAM19
## 4 ADAM19
## 6 ADRB2
## 7 ADRB2
## Function
## 1 iron metabolism
## 2 actin-binding
## 3 osteoblast differentiation
## 4 osteoblast differentiation
## 6 bone density and mineralization
## 7 bone density and mineralization, tooth organogenesis, craniofacial development
## Chromsome Species Citation
## 1 <NA> Gasterosteus aculeatus Jones et al 2012b
## 2 <NA> Gasterosteus aculeatus Hohenlohe et al 2010
## 3 IV Gasterosteus aculeatus Hohenlohe et al 2010
## 4 IV Gasterosteus aculeatus Hohenlohe et al 2010
## 6 VII Gasterosteus aculeatus Hohenlohe et al 2010
## 7 IV Gasterosteus aculeatus Hohenlohe et al 2010
## Scovelli_geneID Notes Chrom StartBP StopBP rad.loci
## 1 SSCG00000000949 <NA> LG14 5689975 5711169 <NA>
## 2 SSCG00000004181 <NA> LG14 12122996 12144915 <NA>
## 3 SSCG00000004162,SSCG00000007962 <NA> LG14 11860301 11877333 <NA>
## 4 SSCG00000004162,SSCG00000007962 <NA> LG14 22306738 22325075 <NA>
## 6 SSCG00000004169 <NA> LG14 12061583 12065881 <NA>
## 7 SSCG00000004169 <NA> LG14 12061583 12065881 <NA>
## fst.in nearby.rad
## 1 NA LG14.3289190,LG14.3352883,LG14.3960463,LG14.5029044
## 2 NA LG14.5029044,LG14.13689654,LG14.18295431
## 3 NA LG14.5029044,LG14.13689654,LG14.18295431
## 4 NA LG14.18295431
## 6 NA LG14.5029044,LG14.13689654,LG14.18295431
## 7 NA LG14.5029044,LG14.13689654,LG14.18295431
So of the 540 gene annotations 229 are within the mean LD range of an outlier, which represent 67 putative freshwater genes. Those genes are: ABCB7, ABLIM3, ADAM19, ADRB2, AFAP1L1, angiotensin II receptor, ANXA6, APOL, AQP3, ARHGEF3, ATP6V0A1, ATP6V1A, AVPR2, BGN, CA, CAII, CAMKK1, CD63, CEBPD, CFTR, cortisol, ECaC, EDA, EFNB1, FBLN1, FERH1, FZD2, HRH2, IGF-I, IGFBP5, IGFBP6, IL12B, KCNH4, KITLG, LEPR, LTB4R, MAPK11, MAPK12, MSX2, mucin, NAKATPase, NBC1, NCX, NHE, NKCC, NR4A1, PDLIM7, PMCA, PODXL, PRKCD, PRL, RDH10, RHOA, RHOGTP8, rtCR2, SCUBE1, SLC26A3, SLC2A13, SPG1, SYNPO, TBX2, TRIM14, UT, VATPase, WNT5A, WNT7B, ZEB1
Now for \(\delta\) -divergence
#in the gene
sdd.in<-outlier.in.region(dd.bp.smooth[[1]],put.reg,oBP = "Pos")
put.reg$sdd.in<-sdd.in
The genes ARHGEF3, OSBPL8, RHOGTP8 contain \(\delta\)-divergence outlier regions.
#in the gene
sdd.nb<-outlier.nearby(dd.bp.smooth[[1]],put.reg,chrom.ld,oBP = "Pos")
put.reg$sdd.nb<-sdd.nb
And of the 154 \(\delta\)-divergence outliers, 91.4814815% are in the LD region around putative freshwater genes.
#taken directly from fwsw_analysis.R
bf<-read.delim("bayenv/p4.bf.txt",header=T)
bf$SNP<-paste(bf$scaffold,as.numeric(as.character(bf$BP))+1,sep=".")
bf.co<-apply(bf[,5:7],2,quantile,0.99) #focus on Bayes Factors, because of Lotterhos & Whitlock (2015)
temp.bf.sig<-bf[bf$Temp_BF>bf.co["Temp_BF"],c(1,2,4,8,5,9)]
sal.bf.sig<-bf[bf$Salinity_BF>bf.co["Salinity_BF"],c(1,2,4,8,6,9)]
grass.bf.sig<-bf[bf$seagrass_BF>bf.co["seagrass_BF"],c(1,2,4,8,7,9)]
#get the log transformed Bayes Factors
bf$logSal<-log(bf$Salinity_BF)
bf$logTemp<-log(bf$Temp_BF)
bf$logSeagrass<-log(bf$seagrass_BF)
There are 0 overlapping outliers between temperature-, salinity-, and seagrass-associated loci.
But if we only care about salinity ones, there are 91 outliers.
Are any of the Bayenv salinity outliers near the putative freshwater genes?
bfs.in<-outlier.in.region(sal.bf.sig,put.reg,"scaffold")
bfs.nb<-outlier.nearby(sal.bf.sig,put.reg,chrom.ld,"scaffold")
Of the 9053 RAD loci analyzed by Bayenv2 for associations with temperature, 0.3703704% are in putative freshwater genes and 75.5555556% are within the LD neighborhood.
Are any of the Bayenv temperature outliers near putative freshwater genes?
bft.in<-outlier.in.region(temp.bf.sig,put.reg,"scaffold")
bft.nb<-outlier.nearby(temp.bf.sig,put.reg,chrom.ld,"scaffold")
Of the 9053 RAD loci analyzed by Bayenv2 for associations with temperature, 0.5555556% are in putative freshwater genes and 80.1851852% are within the LD neighborhood.
Are any of the loci associated with seagrass density in or near putative freshwater genes?
bfg.in<-outlier.in.region(grass.bf.sig,put.reg,"scaffold")
bfg.nb<-outlier.nearby(grass.bf.sig,put.reg,chrom.ld,"scaffold")
Of the 9053 RAD loci analyzed by Bayenv2 for associations with temperature, 0.1851852% are in putative freshwater genes and 77.962963% are within the LD neighborhood.
put.reg$bfs.in<-bfs.in
put.reg$bft.in<-bft.in
put.reg$bfg.in<-bfg.in
unique(put.reg[put.reg$bfs.in !="NA","Gene"])
## [1] ARHGEF3 NHE
## 110 Levels: ABCB7 ABLIM3 ADAM19 ADAMTS10 ADRB2 AE ... ZEB1
unique(put.reg[put.reg$bft.in !="NA","Gene"])
## [1] APOL NHE TRIM14
## 110 Levels: ABCB7 ABLIM3 ADAM19 ADAMTS10 ADRB2 AE ... ZEB1
unique(put.reg[put.reg$bfg.in !="NA","Gene"])
## [1] ARHGEF3
## 110 Levels: ABCB7 ABLIM3 ADAM19 ADAMTS10 ADRB2 AE ... ZEB1
All three BayEnv tests identified outliers in the genes . The temperature and grass also share NA, and temperature and salinity share NHE.
Jost’s D measures the fraction of allelic differentiation between populations. According to Molecular Ecologist, “if allelic differentiation at a particular locus is the value of interest, it appears that D is the best measure”, so it might be useful to look at Jost’s D in the putatively freshwater genes.
jost.in<-outlier.in.region(jd.bp.smooth[[1]],put.reg,oChrom="Chrom",oBP="Pos")
jost.nb<-outlier.nearby(jd.bp.smooth[[1]],put.reg,chrom.ld,oChrom = "Chrom",oBP="pos")
put.reg$jostd.nb<-jost.nb
put.reg$jostd.in<-jost.in
So \(1\) loci are located within putative freshwater genes, but \(0\) are within the LD region of the putative genes, representing \(0\) of the putative gene annotations and \(0\) genes.
kable(unique(put.reg[put.reg$fst.in != "NA",c("Gene","Chrom")]),caption = "Putative genes containing Fst outliers",row.names = FALSE)
| Gene | Chrom |
|---|---|
| APOL | LG14 |
| SPG1 | LG1 |
kable(unique(put.reg[put.reg$sdd.in != "NA",c("Gene","Chrom")]),
caption="Putative genes containing delta divergence outliers",row.names = FALSE)
| Gene | Chrom |
|---|---|
| ARHGEF3 | LG13 |
| OSBPL8 | LG17 |
| RHOGTP8 | LG15 |
| RHOGTP8 | LG20 |
kable(unique(put.reg[put.reg$jostd.in !="NA",c("Gene","Chrom")]),
caption="Putative genes containing Jost's D outliers",row.names = FALSE)
| Gene | Chrom |
|---|---|
| NHE | LG11 |
kable(unique(put.reg[put.reg$bfs.in != "NA",c("Gene","Chrom")]),
caption="Putative genes containing salinity-associated SNPs",row.names = FALSE)
| Gene | Chrom |
|---|---|
| ARHGEF3 | LG14 |
| NHE | LG11 |
kable(unique(put.reg[put.reg$fst.in != "NA",c("Gene","Chrom")])[
unique(put.reg[put.reg$fst.in != "NA","Gene"]) %in% unique(put.reg[put.reg$sdd.in != "NA","Gene"])&
unique(put.reg[put.reg$fst.in != "NA","Gene"]) %in% unique(put.reg[put.reg$bfs.in != "NA","Gene"])],
caption="Putative genes containing all of those outliers",row.names=FALSE)
Table: Putative genes containing all of those outliers
kable(unique(put.reg[put.reg$fst.in != "NA",c("Gene","Chrom")])[
unique(put.reg[put.reg$fst.in != "NA","Gene"]) %in% unique(put.reg[put.reg$sdd.in != "NA","Gene"])&
unique(put.reg[put.reg$fst.in != "NA","Gene"]) %in% unique(put.reg[put.reg$jostd.in != "NA","Gene"])],
caption="Putative genes containing divergence outliers (Fst, deltad, D)",row.names = FALSE)
Table: Putative genes containing divergence outliers (Fst, deltad, D)
kable(unique(put.reg[put.reg$nearby.rad != "NA","Gene"])[
unique(put.reg[put.reg$nearby.rad != "NA","Gene"]) %in% unique(put.reg[put.reg$sdd.nb != "NA","Gene"])&
unique(put.reg[put.reg$nearby.rad != "NA","Gene"]) %in% unique(put.reg[put.reg$jostd.in!= "NA","Gene"])],
caption="Putative genes near divergence outliers (Fst, deltad, D)",row.names=FALSE)
| x |
|---|
| NHE |
But looking at put.reg[put.reg$Gene =="RHOGTP8","rad.loci"], it’s obvious that Rho GTPase-activating proteins are distributed throughout the genome (on chromosomes LG11, LG15, LG20, LG14, LG1, scaffold_1578, LG12, LG9, LG8, LG22, LG18, LG5, scaffold_1008, LG6, scaffold_950, LG10, LG7, scaffold_139, LG21, LG16, LG19).
Find genomic regions with high \(\pi\) and low \(\delta\)-divergence
#Do high pi and het have low deltad?
#do it per chrom
balancing.sel<-function(pi.out,pi,ht.out,ht,
dd.out,dd,jd.out,jd){
pi.upp<-pi.out[pi.out$direction=="upper",]
ht.upp<-ht.out[ht.out$direction=="upper",]
dd.low<-dd.out[dd.out$direction=="lower",]
jd.low<-jd.out[jd.out$direction=="lower",]
shared.div<-pi.upp[pi.upp[,pi] %in% ht.upp[,ht],]
shared.dif<-dd.low[dd.low[,dd] %in% jd.low[,jd],]
bal<-shared.div[shared.div[,pi] %in% shared.dif[,dd]]
return(list(div=shared.div,dif=shared.dif,bal=bal))
}
bal<-balancing.sel(pi.pp.smooth[[1]],"plot.pos",ht.pp.smooth[[1]],"plot.pos",
dd.pp.smooth[[1]],"plot.pos",jd.pp.smooth[[1]],"plot.pos")
shared.upp<-bal$div
There are loci with high \(\pi\) and low \(\delta\)-divergence on 13 chromosomes.
Instead of the lumped marine-freshwater \(F_{ST}\) values that I used originally, I’m going to plot the average of pairwise \(F_{ST}\) values.
fst.means<-fwsw.al
for(i in 1:nrow(fst.means)){
if(fst.means$Locus.ID[i] %in% fwsw.fl$Locus.ID &
fst.means$Locus.ID[i] %in% fwsw.la$Locus.ID &
fst.means$Locus.ID[i] %in% fwsw.tx$Locus.ID){
mu<-mean(fwsw.fl$Corrected.AMOVA.Fst[fwsw.fl$Locus.ID==fst.means$Locus.ID[i]],
fwsw.la$Corrected.AMOVA.Fst[fwsw.la$Locus.ID==fst.means$Locus.ID[i]],
fwsw.al$Corrected.AMOVA.Fst[fwsw.al$Locus.ID==fst.means$Locus.ID[i]],
fwsw.tx$Corrected.AMOVA.Fst[fwsw.tx$Locus.ID==fst.means$Locus.ID[i]])
fst.means$Corrected.AMOVA.Fst[i]<-mu
}
else{
fst.means$Corrected.AMOVA.Fst[i]<-NA
}
}
fst.means<-fst.means[!is.na(fst.means$Corrected.AMOVA.Fst),]
fwsw<-fst.means
fst.points<-FALSE
#and putative genes
put.genes<-read.delim("putative_genes.txt",header=TRUE,sep='\t')
#genome annotations
#put.reg<-read.delim("putative.gene.regions.tsv",header=T)
chrom.ld<-tapply(ld[ld$D.>0.5,"pos.diff"],ld[ld$D. >0.5,"Chrom"],mean)
#select genes of interest
#fav.genes<-c("AQP3","TNS1","CAMKK1","mucin","CAII","NAKATPase","ARHGEF3")
#fav.genes<-c("TNS1","CAII","TRIM14","VATPase")
fav.genes<-c("ARHGEF3", "mucin", "NHE", "TAAR")
genes2plot<-put.reg[put.reg$Gene %in% fav.genes,]
genes2plot$Gene<-as.character(genes2plot$Gene)
genes2plot$Gene[genes2plot$Gene=="ARHGEF3"]<-"ARHGEF" #rename
#shift LG3 names
genes2plot$StartBP[genes2plot$Chrom=="LG3" & genes2plot$Gene=="ARHGEF"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG3" & genes2plot$Gene=="ARHGEF"]+200000
genes2plot$StartBP[genes2plot$Chrom=="LG3" & genes2plot$Gene=="mucin"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG3" & genes2plot$Gene=="mucin"]-100000
#shift LG7 names
genes2plot$StartBP[genes2plot$Chrom=="LG7" & genes2plot$Gene=="ARHGEF"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG7" & genes2plot$Gene=="ARHGEF"]+200000
genes2plot$StartBP[genes2plot$Chrom=="LG7" & genes2plot$Gene=="NHE"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG7" & genes2plot$Gene=="NHE"]-200000
#shift LG14 names
genes2plot$StartBP[genes2plot$Chrom=="LG14" & genes2plot$Gene=="ARHGEF"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG14" & genes2plot$Gene=="ARHGEF"]+550000
genes2plot$StartBP[genes2plot$Chrom=="LG14" & genes2plot$Gene=="NHE"]<-
genes2plot$StartBP[genes2plot$Chrom=="LG14" & genes2plot$Gene=="NHE"]-600000
#get rid of the final ARHGEF
genes2plot<-genes2plot[-which(genes2plot$Chrom=="LG14" & genes2plot$Gene=="ARHGEF" & genes2plot$StartBP>23022883),]
#shared Fst outliers
fw.sig.reg<-read.csv("p4.StacksFWSWOutliers_annotatedByGenome.csv")
h.pi.name<-"HandPi_subgenes.png"
row.settings<-c(4,4)
chroms2plot<-unique(shared.upp$Chr)
#generate xlims
xlims<-lapply(chroms2plot,function(lg,ld,genes,vcf){
xs<-c(min(genes[genes$Chr == lg,"plot.pos"]-ld[lg]),
max(genes[genes$Chr == lg,"plot.pos"]+ld[lg]))
if(xs[1] < 0){
xs[1]<-0
}
if(xs[2] > max(vcf$POS[vcf$`#CHROM`==lg])){
xs[2]<-max(vcf$POS[vcf$`#CHROM`==lg])
}
return(xs)
},ld=chrom.ld,genes=shared.upp,vcf=vcf)
#or do full chroms
#generate xlims
xlims<-lapply(chroms2plot,function(lg,genes,vcf){
xs<-c(min(vcf$POS[vcf$`#CHROM`==lg]),
max(vcf$POS[vcf$`#CHROM`==lg]))
return(xs)
},genes=shared.upp,vcf=vcf)
names(xlims)<-chroms2plot
HandPiName<-expression(Large~pi~and~italic(H))
includeArrows<-FALSE
#colors
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
#find the plotting location
nearest.pos<-function(stat.df,s.pos,s.stat,target,t.pos,t.stat){
near.pos<-t(apply(stat.df,1,function(df,target){
x<-as.numeric(df[s.pos])
pos<-target[which.min(abs(x-as.numeric(target[,t.pos]))),t.pos]
if(pos==x){
stat<-as.numeric(target[which.min(abs(x-as.numeric(target[,t.pos]))),t.stat])
}else{
if(pos>x){
upp<-which.min(abs(x-target[,t.pos]))
low<-upp-1
}else{
low<-which.min(abs(x-target[,t.pos]))
upp<-low+1
}
upp.pt<-target[upp,c(t.pos,t.stat)]
low.pt<-target[low,c(t.pos,t.stat)]
slope<-as.numeric((upp.pt[2]-low.pt[2])/(upp.pt[1]-low.pt[1]))
b<-as.numeric(upp.pt[2]-(slope*upp.pt[1]))
stat<-slope*x+b
}
return(cbind(x,stat))
},target=target))
return(near.pos)
}
upp.low.pts<-function(smooth,target,chrom,color,stat,pos.name,...){
#smooth== smooth[[1]]
#target== smooth[[2]]
upp<-smooth[smooth[,"Chrom"]%in% chrom & smooth$direction=="upper",]
low<-smooth[smooth[,"Chrom"]%in% chrom & smooth$direction=="lower",]
upp.pts<-nearest.pos(stat.df=upp,s.pos=pos.name,s.stat=stat,
target=target[target[,"chr"]%in% chrom,],
t.pos="pos",t.stat="smoothed.stats")
low.pts<-nearest.pos(stat.df=low,s.pos=pos.name,s.stat=stat,
target=target[target[,"chr"]%in% chrom,],
t.pos="pos",t.stat="smoothed.stats")
points(x=upp.pts[,1],y=upp.pts[,2],pch=24,bg=color,col=color,...)
points(x=low.pts[,1],y=low.pts[,2],pch=25,bg=color,col=color,...)
}
png(h.pi.name,height=8,width=14,units="in",res=300)
par(mfrow=row.settings,oma=c(1,1,1,1),mar=c(1,2,2,1))
for(i in 1:length(chroms2plot)){
this.df<-fwsw[fwsw$Chr %in% chroms2plot[i],]
this.xlim<-xlims[[as.character(chroms2plot[i])]]
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(-0.2,0.5),axes=F,ylab="",xlab="",type='n')
xmin<-this.xlim[1]#min(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
xmax<-this.xlim[2]#max(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
#the shared peaks
p<-lapply(shared.upp$Pos[shared.upp$Chrom %in% chroms2plot[i]],
function(pos){
points(y=c(-0.2,0.5),x=c(pos,pos),
type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
})
points(y=rep(c(-0.2,0.5),length(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]])),
x=c(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]],
shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]]),
type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
#putative gene regions
g<-genes2plot[genes2plot$Chrom %in% chroms2plot[i] &
genes2plot$StartBP >= xmin & genes2plot$StartBP <= xmax,]
a<-put.reg[put.reg$Chrom %in% chroms2plot[i] & !(put.reg$Gene %in% fav.genes),]
if(nrow(g) > 0){
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=-0.2,ytop=0.44,col="indianred",border="indianred")
}
#Fst
if(fst.points==TRUE){
points(this.df$BP,this.df$Corrected.AMOVA.Fst,pch=19,cex=1.5,
col=alpha(col=comp.col["Fst"],0.25),bg=alpha(col=comp.col["Fst"],0.25))
}
#Pi
points(pi.bp.smooth[[2]][pi.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",lwd=2,col=comp.col["pi"])
if(isTRUE(includeArrows)){
upp.low.pts(smooth=pi.bp.smooth[[1]],target=pi.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["pi"],stat="Pi",pos.name="Pos")
}
#Het
points(ht.bp.smooth[[2]][ht.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["Het"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=ht.bp.smooth[[1]],target=ht.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["Het"],stat="Het",pos.name="Pos")
}
#deltad
points(dd.bp.smooth[[2]][dd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["deltad"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=dd.bp.smooth[[1]],dd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["deltad"],stat="deltad",pos.name="Pos")
}
#Josts D
points(jd.bp.smooth[[2]][jd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["D"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=jd.bp.smooth[[1]],jd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["D"],stat="D",pos.name="Pos")
}
#shared Fst outliers
points(this.df$BP[this.df$BP %in% fw.sig.reg$BP],
rep(0.5,length(this.df$BP[this.df$BP %in% fw.sig.reg$BP])),
pch="|",cex=2,col="orchid4",lwd=3)
if(nrow(g)>0){
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
text(x=txt.locs$starts,y=0.35,cex=2,labels=txt.locs$name,srt=90,xpd=T)
}
#axes etc
axis(1,pos=-0.2,c(xmin,xmax),
labels = c(round((xmin/1000000),2),round((xmax/1000000),2)),cex.axis=2)
axis(2,las=1,hadj=0.75,cex.axis=2,pos=xmin,at=c(-0.25,0,0.25,0.5))
mtext(chroms2plot[i],1,cex=2*0.75,line=1)
}
#par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0),mar=c(0, 0, 0, 0), new=TRUE)
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend("topleft",xpd=TRUE,
legend=c("Putative Gene",
expression(Shared~italic(F)[ST]~Outlier),
HandPiName),
bty='n',pch=c(15,124,15),#lwd=c(2,1,4,2,2,0,2,2),lty=c(1,0,1,1,1,0,1,1),
cex = 2,x.intersp = 0.5,y.intersp = 0.75,text.width=0.45,
col=c("indianred","orchid4",alpha("#543005",0.75)))
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend("topleft",xpd=TRUE,
legend=c(expression(italic(H)),
expression(pi),#expression(FW-SW~italic(F)[ST]),
expression("Jost's"~italic(D)),
expression(delta~-divergence)),
bty='n',pch=c(15,15,15,15),#lwd=c(2,1,4,2,2,0,2,2),lty=c(1,0,1,1,1,0,1,1),
cex = 2,x.intersp = 0.5,y.intersp = 0.75,text.width=0.45,
col=c(comp.col[1:2],#alpha(col=comp.col["Fst"],0.25)
comp.col[4:5]))
dev.off()
## png
## 2
Figure 6. Low \(\delta\)-divergence and high \(\pi\)
To demonstrate these patterns, let’s focus on the clearest signal, on LG8. I’ll do three plots, one for pi and H, one for \(\delta\)-divergence and Jost’s D, and one with the four FWSW Fst comparisons.
## SETUP
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
this.chrom<-"LG8"
i<-which(chroms2plot==this.chrom) #LG8 is the second in the 'chroms2plot' list
this.df<-fwsw[fwsw$Chr %in% chroms2plot[i],]
this.xlim<-xlims[[as.character(chroms2plot[i])]]
xmin<-this.xlim[1]
xmax<-this.xlim[2]
# Find the shared peak
pos<-shared.upp$Pos[shared.upp$Chrom %in% chroms2plot[i]]
regmin<-pos-mean(ld$pos.diff[ld$Chrom==this.chrom & ld$LocIDA==pos & ld$D. > 0.5])
regmax<-pos+mean(ld$pos.diff[ld$Chrom==this.chrom & ld$LocIDA==pos & ld$D. > 0.5])
# Find the putative gene regions
g<-put.reg[put.reg$Chrom == this.chrom,c("Gene","Chrom","StartBP","StopBP")]
#remove duplicates
if(this.chrom=="LG8"){
g<-g[g$Gene !="ATP6V1A",]
g<-g[-which(g$Gene=="VATPase" & g$StartBP<3000000),]
}
g<-g[!duplicated(g),]
txt.locs<-data.frame(starts=unique(g$StartBP),name=as.character(g$Gene[!duplicated(g$StartBP)]),
stringsAsFactors = FALSE)
txt.locs<-txt.locs[order(txt.locs$starts),]
rownames(txt.locs)<-NULL
#move the labels
if(this.chrom=="LG8"){
txt.locs$starts[12]<-txt.locs$starts[12]+100000 #VATPase
txt.locs$starts[10]<-txt.locs$starts[10]-100000 #MKP8
txt.locs$starts[11]<-txt.locs$starts[11]-100000 #mucin
txt.locs$starts[txt.locs$name=="RHOGTP8"]<-txt.locs$starts[txt.locs$name=="RHOGTP8"]+50000 #RhoGTP
txt.locs$starts[5]<-txt.locs$starts[5]+50000 #PRL2
txt.locs$starts[1]<-txt.locs$starts[1]+90000 #TRIM14
txt.locs$starts[2]<-txt.locs$starts[2]+100000 #VATPase
txt.locs$starts[3]<-txt.locs$starts[3]+100000 #PRL1
#rename
txt.locs$name[txt.locs$name=="ATP6V0A1"]<-"VATPase"
txt.locs$name[txt.locs$name=="RHOGTP8"]<-"RhoGTP"
}
## PLOT
png(paste(this.chrom,"_balancingsel.png",sep=""),height=6,width=10,units="in",res=300)
par(mfrow=c(3,1),oma=c(2,2,2,1),mar=c(1,2,1.5,2),xpd=TRUE)
# Empty plot for H and pi
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(0,0.5),axes=F,ylab="",xlab="",type='n')
# Add the shared peak
points(y=c(0,0.5),x=c(pos,pos),type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
rect(regmin,0,regmax,0.5,col=alpha("#543005",0.15),border=NA) #LD region
# Add the putative gene regions
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=0,ytop=0.5,col="indianred",border="indianred")
# Add Pi
points(pi.bp.smooth[[2]][pi.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",lwd=2,col=comp.col["pi"])
#upp.low.pts(smooth=pi.bp.smooth[[1]],target=pi.bp.smooth[[2]],chrom=chroms2plot[i],
# color=comp.col["pi"],stat="Pi",pos.name="Pos")
# Add Het
points(ht.bp.smooth[[2]][ht.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["Het"],lwd=2)
#upp.low.pts(smooth=ht.bp.smooth[[1]],target=ht.bp.smooth[[2]],chrom=chroms2plot[i],
# color=comp.col["Het"],stat="Het",pos.name="Pos")
# Add putative gene name
#text(x=txt.locs$starts[4:12],y=c(0.25,rep(0.35,3),rep(0.25,5)),
# cex=2,labels=txt.locs$name[4:12],srt=90,xpd=T,font=2)
#text(x=txt.locs$starts[1:3],y=c(0.43,0.3,0.05),cex=2,labels=txt.locs$name[1:3],xpd=T,font=2,pos=2)
# axes etc
axis(1,pos=0,c(xmin,xmax),labels = FALSE)
axis(2,las=1,cex.axis=2,pos=xmin,at=c(0,0.25,0.5))
legend(0,0.65,legend=c(expression(italic(H)),expression(pi)),
bty='n',lwd=c(2,2),lty=c(1,1),xpd=TRUE,ncol=2,
cex = 2,col=c(comp.col["Het"],comp.col["pi"]),x.intersp = 0.2)
# Empty plot for D and delta-divergence
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(-0.1,0.2),axes=F,ylab="",xlab="",type='n')
# Add the shared peak
points(y=c(-0.1,0.2),x=c(pos,pos),type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
rect(regmin,-0.1,regmax,0.2,col=alpha("#543005",0.15),border=NA) #LD region
# Add the putative gene regions
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=-0.1,ytop=0.2,col="indianred",border="indianred")
# Add deltad
points(dd.bp.smooth[[2]][dd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["deltad"],lwd=2)
#upp.low.pts(smooth=dd.bp.smooth[[1]],dd.bp.smooth[[2]],chrom=chroms2plot[i],
# color=comp.col["deltad"],stat="deltad",pos.name="Pos")
# Add Josts D
points(jd.bp.smooth[[2]][jd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["D"],lwd=2)
#upp.low.pts(smooth=jd.bp.smooth[[1]],jd.bp.smooth[[2]],chrom=chroms2plot[i],
# color=comp.col["D"],stat="D",pos.name="Pos")
# axes etc
axis(1,pos=-0.1,c(xmin,xmax),labels = FALSE)
axis(2,las=1,cex.axis=2,pos=xmin,at=c(-0.1,0.05,0.2))
legend(0,0.28,legend=c(expression("Jost's"~italic(D)),expression(delta~-divergence)),
bty='n',lwd=c(2,2),lty=c(1,1),xpd=TRUE,ncol=2,x.intersp = 0.2,text.width = 1000000,
cex = 2,col=c(comp.col["D"],comp.col["deltad"]))
# Empty plot for Fsts
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(0,0.5),axes=F,ylab="",xlab="",type='n')
# Add the shared peak
points(y=c(0,0.5),x=c(pos,pos),type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
rect(regmin,0,regmax,0.5,col=alpha("#543005",0.15),border=NA) #LD region
# Add the putative gene regions
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=0,ytop=0.5,col="indianred",border="indianred")
# Add Fsts
points(fwsw.fl$BP[fwsw.fl$Chr==this.chrom],fwsw.fl$Smoothed.AMOVA.Fst[fwsw.fl$Chr==this.chrom],
type="l",col=grp.colors[6],lwd=2)
points(fwsw.tx$BP[fwsw.tx$Chr==this.chrom],fwsw.tx$Smoothed.AMOVA.Fst[fwsw.tx$Chr==this.chrom],
type="l",col=grp.colors[1],lwd=2)
points(fwsw.la$BP[fwsw.la$Chr==this.chrom],fwsw.la$Smoothed.AMOVA.Fst[fwsw.la$Chr==this.chrom],
type="l",col=grp.colors[2],lwd=2)
points(fwsw.al$BP[fwsw.al$Chr==this.chrom],fwsw.al$Smoothed.AMOVA.Fst[fwsw.al$Chr==this.chrom],
type="l",col=grp.colors[2],lwd=2,lty=2)
# Add putative gene name
if(this.chrom=="LG8"){
text(x=txt.locs$starts[4:12],y=c(0.25,rep(0.35,7),0.25),
cex=2,labels=txt.locs$name[4:12],srt=90,xpd=T,font=2)
text(x=txt.locs$starts[1:3],y=c(0.43,0.3,0.15),cex=2,labels=txt.locs$name[1:3],xpd=T,font=2,pos=2)
}else{
text(x=txt.locs$starts,y=rep(0.25,length(txt.locs$starts)),
cex=2,labels=txt.locs$name,srt=90,xpd=T,font=2)
}
# axes etc
axis(1,pos=0,c(xmin,xmax),
labels = c(round((xmin/1000000),2),round((xmax/1000000),2)),cex.axis=2)
axis(2,las=1,cex.axis=2,pos=xmin,at=c(0,0.25,0.5))
legend(0,0.65,legend=c(expression("TX FWSW"~italic(F)[ST]),
expression("LA FWSW"~italic(F)[ST]),
expression("AL FWSW"~italic(F)[ST]),
expression("FL FWSW"~italic(F)[ST])),
bty='n',lwd=2,lty=c(1,1,2,1),ncol=4,xpd=TRUE,
cex = 2,col=c(grp.colors[1],grp.colors[2],grp.colors[2],grp.colors[6]),
x.intersp = 0.2)
# Add x-axis label
mtext(paste("Position on",this.chrom,"(Mb)"),1,cex=2*0.75,line=1)
dev.off()
## png
## 2
## png
## 2
Figure 6. An example of a region with putative balancing selection. The SNP designated by the dark brown bar was in the top 1% of two measures of diversity and in the lower 1% of two measures of divergence, suggesting that this SNP may be in a region experiencing balancing selection. The two measures of diversity are plotted in the top panel: nucleotide diversity, π, in dark teal and observed heterozygosity, H, in light teal. The two measures of divergence are in the middle panel: Jost’s D in dark brown and δ-divergence in light brown. The bottom panel shows the four pairwise freshwater-saltwater FST values for the sites in Texas (solid dark purple), Louisiana (solid light purple), Alabama (dashed light purple), and Florida (solid green). The red vertical lines indicate candidate freshwater adaptation genes and are annotated in the bottom panel. The grey rectangle indicates the dc region around the SNP (see text for details).
upp.pi<-NULL
low.pi<-NULL
upp.het<-NULL
low.het<-NULL
upp.dd<-NULL
low.dd<-NULL
for(i in 1:length(lgs)){
upp.pi<-rbind(upp.pi,avg.pi.adj[avg.pi.adj$Avg.Stat >=
quantile(avg.pi.adj$Avg.Stat[avg.pi.adj$Chr %in% lgs[i]],0.95) &
avg.pi.adj$Chr %in% lgs[i],])
low.pi<-rbind(low.pi,avg.pi.adj[avg.pi.adj$Avg.Stat <=
quantile(avg.pi.adj$Avg.Stat[avg.pi.adj$Chr %in% lgs[i]],0.05) &
avg.pi.adj$Chr %in% lgs[i],])
upp.het<-rbind(upp.het,avg.het.adj[avg.het.adj$Avg.Stat >=
quantile(avg.het.adj$Avg.Stat[avg.het.adj$Chr %in% lgs[i]],0.95) &
avg.het.adj$Chr %in% lgs[i],])
low.het<-rbind(low.het,avg.het.adj[avg.het.adj$Avg.Stat <=
quantile(avg.het.adj$Avg.Stat[avg.het.adj$Chr %in% lgs[i]],0.05) &
avg.het.adj$Chr %in% lgs[i],])
}
shared.low<-low.pi[low.pi$plot.pos %in% low.het$plot.pos,]
shared.low$plot.min<-shared.low$Avg.Pos-250000
shared.low$plot.max<-shared.low$Avg.Pos+250000
h.pi.name<-"lowdiv_subgenes.png"
row.settings<-c(4,3)
chroms2plot<-unique(shared.low$Chr)
shared.upp<-shared.low
fst.points<-FALSE
HandPiName<-expression(Low~pi~and~italic(H))
#colors
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
#find the plotting location
nearest.pos<-function(stat.df,s.pos,s.stat,target,t.pos,t.stat){
near.pos<-t(apply(stat.df,1,function(df,target){
x<-as.numeric(df[s.pos])
pos<-target[which.min(abs(x-as.numeric(target[,t.pos]))),t.pos]
if(pos==x){
stat<-as.numeric(target[which.min(abs(x-as.numeric(target[,t.pos]))),t.stat])
}else{
if(pos>x){
upp<-which.min(abs(x-target[,t.pos]))
low<-upp-1
}else{
low<-which.min(abs(x-target[,t.pos]))
upp<-low+1
}
upp.pt<-target[upp,c(t.pos,t.stat)]
low.pt<-target[low,c(t.pos,t.stat)]
slope<-as.numeric((upp.pt[2]-low.pt[2])/(upp.pt[1]-low.pt[1]))
b<-as.numeric(upp.pt[2]-(slope*upp.pt[1]))
stat<-slope*x+b
}
return(cbind(x,stat))
},target=target))
return(near.pos)
}
upp.low.pts<-function(smooth,target,chrom,color,stat,pos.name,...){
#smooth== smooth[[1]]
#target== smooth[[2]]
upp<-smooth[smooth[,"Chrom"]%in% chrom & smooth$direction=="upper",]
low<-smooth[smooth[,"Chrom"]%in% chrom & smooth$direction=="lower",]
upp.pts<-nearest.pos(stat.df=upp,s.pos=pos.name,s.stat=stat,
target=target[target[,"chr"]%in% chrom,],
t.pos="pos",t.stat="smoothed.stats")
low.pts<-nearest.pos(stat.df=low,s.pos=pos.name,s.stat=stat,
target=target[target[,"chr"]%in% chrom,],
t.pos="pos",t.stat="smoothed.stats")
points(x=upp.pts[,1],y=upp.pts[,2],pch=24,bg=color,col=color,...)
points(x=low.pts[,1],y=low.pts[,2],pch=25,bg=color,col=color,...)
}
png(h.pi.name,height=8,width=14,units="in",res=300)
par(mfrow=row.settings,oma=c(1,1,1,1),mar=c(1,2,2,1))
for(i in 1:length(chroms2plot)){
this.df<-fwsw[fwsw$Chr %in% chroms2plot[i],]
this.xlim<-xlims[[as.character(chroms2plot[i])]]
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(-0.2,0.5),axes=F,ylab="",xlab="",type='n')
xmin<-this.xlim[1]#min(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
xmax<-this.xlim[2]#max(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
#the shared peaks
p<-lapply(shared.upp$Pos[shared.upp$Chrom %in% chroms2plot[i]],
function(pos){
points(y=c(-0.2,0.5),x=c(pos,pos),
type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
})
points(y=rep(c(-0.2,0.5),length(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]])),
x=c(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]],
shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]]),
type="l",col=alpha("#543005",0.75),cex=2,lwd=4)
#putative gene regions
g<-genes2plot[genes2plot$Chrom %in% chroms2plot[i] &
genes2plot$StartBP >= xmin & genes2plot$StartBP <= xmax,]
a<-put.reg[put.reg$Chrom %in% chroms2plot[i] & !(put.reg$Gene %in% fav.genes),]
if(nrow(g) > 0){
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=-0.2,ytop=0.44,col="indianred",border="indianred")
}
#Fst
if(fst.points==TRUE){
points(this.df$BP,this.df$Corrected.AMOVA.Fst,pch=19,cex=1.5,
col=alpha(col=comp.col["Fst"],0.25),bg=alpha(col=comp.col["Fst"],0.25))
}
#Pi
points(pi.bp.smooth[[2]][pi.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",lwd=2,col=comp.col["pi"])
if(isTRUE(includeArrows)){
upp.low.pts(smooth=pi.bp.smooth[[1]],target=pi.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["pi"],stat="Pi",pos.name="Pos")
}
#Het
points(ht.bp.smooth[[2]][ht.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["Het"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=ht.bp.smooth[[1]],target=ht.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["Het"],stat="Het",pos.name="Pos")
}
#deltad
points(dd.bp.smooth[[2]][dd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["deltad"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=dd.bp.smooth[[1]],dd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["deltad"],stat="deltad",pos.name="Pos")
}
#Josts D
points(jd.bp.smooth[[2]][jd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["D"],lwd=2)
if(isTRUE(includeArrows)){
upp.low.pts(smooth=jd.bp.smooth[[1]],jd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["D"],stat="D",pos.name="Pos")
}
#shared Fst outliers
points(this.df$BP[this.df$BP %in% fw.sig.reg$BP],
rep(0.5,length(this.df$BP[this.df$BP %in% fw.sig.reg$BP])),
pch="|",cex=2,col="orchid4",lwd=3)
if(nrow(g)>0){
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
text(x=txt.locs$starts,y=0.35,cex=2,labels=txt.locs$name,srt=90,xpd=T)
}
#axes etc
axis(1,pos=-0.2,c(xmin,xmax),
labels = c(round((xmin/1000000),2),round((xmax/1000000),2)),cex.axis=2)
axis(2,las=1,hadj=0.75,cex.axis=2,pos=xmin,at=c(-0.25,0,0.25,0.5))
mtext(chroms2plot[i],1,cex=2*0.75,line=1)
}
#par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0),mar=c(0, 0, 0, 0), new=TRUE)
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend("topleft",xpd=TRUE,
legend=c("Putative Gene",
expression(Shared~italic(F)[ST]~Outlier),
HandPiName),
bty='n',pch=c(15,124,15),#lwd=c(2,1,4,2,2,0,2,2),lty=c(1,0,1,1,1,0,1,1),
cex = 2,x.intersp = 0.5,y.intersp = 0.75,text.width=0.45,
col=c("indianred","orchid4",alpha("#543005",0.75)))
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend("topleft",xpd=TRUE,
legend=c(expression(italic(H)),
expression(pi),#expression(FW-SW~italic(F)[ST]),
expression("Jost's"~italic(D)),
expression(delta~-divergence)),
bty='n',pch=c(15,15,15,15),#lwd=c(2,1,4,2,2,0,2,2),lty=c(1,0,1,1,1,0,1,1),
cex = 2,x.intersp = 0.5,y.intersp = 0.75,text.width=0.45,
col=c(comp.col[1:2],#alpha(col=comp.col["Fst"],0.25)
comp.col[4:5]))
dev.off()
## png
## 2
Low pi and low het
To test for selective sweeps, I’m using SweeD, which is a modified version of SweepFinder. Although it allows for multi-threading, I’ve found that I can highly parallelize the analysis by first calculating the genome-wide frequency spectrum from the whole vcf and then running each chromosome separately in the background on a supercomputer. So I need to make separate vcf files per chromosome, with a header line.
header<-"##fileformat=VCFv4.2"
for(i in 1:length(lgs)){
write.table(header,paste("sweed/",lgs[i],".vcf",sep=""),quote=FALSE,
col.names=FALSE,row.names=FALSE )
suppressWarnings(write.table(vcf[vcf$`#CHROM`==lgs[i],-708],paste("sweed/",lgs[i],".vcf",sep=""),
col.names=TRUE,row.names=FALSE,quote=FALSE,sep='\t',append = TRUE))
}
Then I can run SweeD on each file, specifying the genoem-wide frequency spectrum file and a grid size of 50 - which I’ve done. Now I can start to make sense of the results. SweeD produces output that shows the position, likelihood, and alpha value. The likelihood value is the one we’re primarily interested in, so I’ll plot those genome-wide.
sweed.files<-list.files(path="sweed",pattern="SweeD_Report",full.names=TRUE)
sweed<-do.call(rbind,lapply(sweed.files,function(file){
swlg<-read.delim(file,skip=2,header=TRUE)
if(length(grep("lg",file)==TRUE)>0){ #correct lowercase chrom names
swlg$Chrom<-paste("LG",gsub(".*lg(\\d+)$","\\1",file),sep="")
}else if(length(grep("LG",file)==TRUE)>0){
swlg$Chrom<-gsub(".*(LG\\d+)$","\\1",file)
}else{
swlg$Chrom<-file
print("Warning: error parsing chromosome name from filename")
}
return(swlg)
}))
#try this
sweed.plot<-fst.plot(sweed,fst.name = "Likelihood", bp.name = "Position",chrom.name = "Chrom",
scaffs.to.plot=plot.scaffs,scaffold.widths = bounds,pch=19,
pt.cex=1,axis.size=0)
axis(2,pos=-5000000,las=1)
labs<-tapply(sweed.plot$plot.pos,sweed.plot$Chrom,median)
text(x=labs[lgs],y=-.5,labels=lgn,xpd=TRUE)
abline(h=quantile(sweed.plot$Likelihood,0.99),lty=2,col="cornflowerblue",lwd=2)
There are 11 of 1100 estimated likelihoods by SweeD that exceed the upper 1% quantile. These are distributed on 5 chromosomes: 1, 3, 5, 1, 1
sweed.out<-sweed[sweed$Likelihood >= quantile(sweed$Likelihood,0.99),]
#need to add a snp for these to work
sweed.out$SNP<-paste(sweed.out$Chrom,sweed.out$Position,sep=".")
sweed.in<-outlier.in.region(sweed.out,put.reg,oBP = "Position",oChrom = "Chrom")
sweed.nb<-outlier.nearby(sweed.out,put.reg,chrom.ld,oBP = "Position",oChrom="Chrom")
0 genes contain SweeD outliers and 49 genes are within the LD distance of the SweeD estimators. However, this is likely because the SweeD likelihood estimates are fairly spread out. So let’s plot those five chromosomes.
chroms2plot<-unique(sweed.out$Chrom)
h.pi.name<-"sweed_perchrom.png"
row.settings<-c(3,2)
shared.upp<-sweed.out
fst.points<-FALSE
xlims<-lapply(chroms2plot,function(lg,genes,vcf){
xs<-c(min(vcf$POS[vcf$`#CHROM`==lg]),
max(vcf$POS[vcf$`#CHROM`==lg]))
return(xs)
},genes=shared.upp,vcf=vcf)
names(xlims)<-chroms2plot
#these are the genes nearby
fav.genes<-c("LEPR","EFNB1","TRIM14","ATP6V0A1","WNT11","angiotensin II receptor",
"EYA","CA","SGK3","FLT4","MSX2")
genes2plot<-put.reg[put.reg$Gene %in% fav.genes,]
genes2plot$Gene<-as.character(genes2plot$Gene)
genes2plot$Gene[genes2plot$Gene == "angiotensin II receptor"]<-"AGTR1"
## fix the locations of a couple for readability
#trim14
trim14.min<-min(put.reg$StartBP[put.reg$Gene=="TRIM14" & put.reg$Chrom=="LG10"])
trim14.max<-max(put.reg$StopBP[put.reg$Gene=="TRIM14" & put.reg$Chrom=="LG10"])
genes2plot$StartBP[put.reg$Chrom=="LG10"&put.reg$Gene=="TRIM14"]<-trim14.min
genes2plot$StopBP[put.reg$Chrom=="LG10"&put.reg$Gene=="TRIM14"]<-trim14.max
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
HandPiName<-"SweeD outliers"
png("Sweed.png",height=6,width=10,res=300,units = "in")
nf <- layout(matrix(c(1,1,1,
2,3,4,
5,6,7), nrow=3, byrow=TRUE),widths=c(1,1,1),heights=c(0.75,1.25,1.25))
# genome-wide
par(oma=c(1,1,1,1),mar=c(1,2,0.5,1))
sweed.plot<-fst.plot(sweed,fst.name = "Likelihood", bp.name = "Position",chrom.name = "Chrom",
scaffs.to.plot=plot.scaffs,scaffold.widths = bounds,pch=19,
pt.cex=1,axis.size=0,y.lim=c(0,15))
axis(2,pos=-5000000,las=1,at=c(0,15),ylim=c(0,15),cex.axis=1.5)
mtext("CLR",2,line=1.5)
labs<-tapply(sweed$plot.pos,sweed$Chrom,median)
text(x=labs[lgs],y=-1.75,labels=lgn,xpd=TRUE,cex=1.5)
abline(h=quantile(sweed.plot$Likelihood,0.99),lty=2,col="cornflowerblue",lwd=2)
# add each chrom with outliers
for(i in 1:length(chroms2plot)){
this.df<-fwsw[fwsw$Chr %in% chroms2plot[i],]
this.xlim<-xlims[[as.character(chroms2plot[i])]]
plot(this.df$BP,this.df$Corrected.AMOVA.Fst, xlim=this.xlim,ylim=c(-0.2,0.5),axes=F,ylab="",xlab="",type='n')
xmin<-this.xlim[1]#min(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
xmax<-this.xlim[2]#max(pi.plot$Pos[pi.plot$Chrom%in%chroms2plot[i]])
#the shared peaks
p<-lapply(shared.upp$Pos[shared.upp$Chrom %in% chroms2plot[i]],
function(pos){
points(y=c(-0.2,0.5),x=c(pos,pos),
type="l",col=alpha("#f1b6da",0.75),cex=2,lwd=4)
})
points(y=rep(c(-0.2,0.5),length(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]])),
x=c(shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]],
shared.upp$plot.pos[shared.upp$Chrom %in% chroms2plot[i]]),
type="l",col=alpha("#f1b6da",0.75),cex=2,lwd=4)
#putative gene regions
g<-genes2plot[genes2plot$Chrom %in% chroms2plot[i] &
genes2plot$StartBP >= xmin & genes2plot$StartBP <= xmax,]
a<-put.reg[put.reg$Chrom %in% chroms2plot[i] & !(put.reg$Gene %in% fav.genes),]
if(nrow(g) > 0){
rect(xleft=as.numeric(g$StartBP),xright=as.numeric(g$StopBP),
ybottom=-0.2,ytop=0.44,col="indianred",border="indianred")
}
#Fst
if(fst.points==TRUE){
points(this.df$BP,this.df$Corrected.AMOVA.Fst,pch=19,cex=1.5,
col=alpha(col=comp.col["Fst"],0.25),bg=alpha(col=comp.col["Fst"],0.25))
}
#Pi
points(pi.bp.smooth[[2]][pi.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",lwd=2,col=comp.col["pi"])
upp.low.pts(smooth=pi.bp.smooth[[1]],target=pi.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["pi"],stat="Pi",pos.name="Pos")
#Het
points(ht.bp.smooth[[2]][ht.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["Het"],lwd=2)
upp.low.pts(smooth=ht.bp.smooth[[1]],target=ht.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["Het"],stat="Het",pos.name="Pos")
#deltad
points(dd.bp.smooth[[2]][dd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["deltad"],lwd=2)
upp.low.pts(smooth=dd.bp.smooth[[1]],dd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["deltad"],stat="deltad",pos.name="Pos")
#Josts D
points(jd.bp.smooth[[2]][jd.bp.smooth[[2]][,"chr"]%in% chroms2plot[i],c("pos","smoothed.stats")],
type="l",col=comp.col["D"],lwd=2)
upp.low.pts(smooth=jd.bp.smooth[[1]],jd.bp.smooth[[2]],chrom=chroms2plot[i],
color=comp.col["D"],stat="D",pos.name="Pos")
#shared Fst outliers
points(this.df$BP[this.df$BP %in% fw.sig.reg$BP],
this.df$Corrected.AMOVA.Fst[this.df$BP %in% fw.sig.reg$BP],
pch=8,cex=2,col="orchid4",lwd=3)
#add text for putative genes
if(i == 1){
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
txt.locs[1,"starts"]<-txt.locs[1,"starts"]-500000
txt.locs[2,"starts"]<-txt.locs[2,"starts"]+500000
text(x=txt.locs$starts,y=0.35,cex=1.5,labels=txt.locs$name,srt=90,xpd=T)
}else if(i==2){
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
txt.locs[3,"starts"]<-mean(txt.locs[3:5,"starts"])
txt.locs<-txt.locs[1:3,]
text(x=txt.locs$starts,y=0.35,cex=1.5,labels=txt.locs$name,srt=90,xpd=T)
}else if(i==4){
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
txt.locs[1,"starts"]<-txt.locs[1,"starts"]+75000
text(x=txt.locs$starts[-4],y=0.35,cex=1.5,labels=txt.locs$name[-4],srt=90,xpd=T)
text(x=txt.locs$starts[4],y=0,cex=1.5,labels=txt.locs$name[4],srt=90,xpd=T)
}else{
txt.locs<-data.frame(starts=unique(g$StartBP),name=g$Gene[!duplicated(g$StartBP)])
txt.locs<-txt.locs[order(txt.locs$starts),]
#txt.locs<-txt.locs[-4,]
text(x=txt.locs$starts,y=0.35,cex=1.5,labels=txt.locs$name,srt=90,xpd=T)
}
#axes etc
axis(1,pos=-0.2,c(xmin,xmax),
labels = c(round((xmin/1000000),2),round((xmax/1000000),2)),cex.axis=1.5)
axis(2,las=1,hadj=0.75,cex.axis=1.5,pos=xmin,at=c(-0.25,0,0.25,0.5))
mtext(chroms2plot[i],1,cex=1.5*0.75,line=1)
}
# legend
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend("topleft",xpd=TRUE,
legend=c("Putative Gene",
expression(Shared~italic(F)[ST]~Outlier),
HandPiName, expression(italic(H)),
expression(pi),
expression("Jost's"~italic(D)),
expression(delta~-divergence)),
bty='n',pch=c(15,8,15,15,15,15,15),#lwd=c(2,1,4,2,2,0,2,2),lty=c(1,0,1,1,1,0,1,1),
pt.cex = 2,cex=1.5,x.intersp = 0.5,y.intersp = 0.75,text.width=0.45,
col=c("indianred","orchid4",alpha("#f1b6da",0.75),comp.col[1:2],comp.col[4:5]))
dev.off()
SweeD Plot
I’m highlighting a few of the putative genes that have a bunch of outliers nearby or in them. First is TNS1, which matched three genome annotations but only had one region, on LG1, that contained shared outlier Fsts, delta divergence, Bayenv salinity, were near monophyletic neighbor joining trees.
fst.dfs<-list(fwsw.tx,fwsw.la,fwsw.al,fwsw.fl)
names(fst.dfs)<-c("TX FWSW","LA FWSW","AL FWSW","FL FWSW")
colors=c(grp.colors[1],grp.colors[2],grp.colors[3],grp.colors[6])
alphas=c(0.5,0.5,1,0.5)
ltys=c(1,1,2,1)
The plotting function gene.region.plot accpets a number of parameters.
gene.region.plot<-function(chrom,gene,put.reg,vcf,chrom.ld, fst.dfs,deltad=FALSE,D=FALSE,
colors="black",lwds=2,alphas=0.5,ltys=1,legend=TRUE,bases="bp",
smooth=FALSE, smooth.loess=TRUE,fst.name="Corrected.AMOVA.FST",txt.cex=1,y.lim=c(0,1),...){
pstart<-min(as.numeric(as.character(put.reg[put.reg$Gene ==gene &
put.reg$Chrom==chrom,"StartBP"])))-(chrom.ld[chrom]/2)
pstop<-max(as.numeric(as.character(put.reg[put.reg$Gene ==gene &
put.reg$Chrom==chrom,"StartBP"])))+(chrom.ld[chrom]/2)
if(pstart < 0){ pstart<- 0 }
if(pstop > max(vcf[vcf$`#CHROM`==chrom,"POS"])){ pstop <- max(vcf[vcf$`#CHROM`==chrom,"POS"]) }
starts<-as.numeric(put.reg[put.reg$Chrom %in% chrom & put.reg$Gene == gene,"StartBP"])
stops<-as.numeric(put.reg[put.reg$Chrom %in% chrom & put.reg$Gene == gene,"StopBP"])
if(smooth==TRUE){
#generate the dataframes
smooth.fsts<-lapply(fst.dfs,function(df){
this.df<-df[df$Chr %in% chrom,]
this.smooth<-do.call("rbind",lapply(seq(1,nrow(this.df),(nrow(this.df)*0.15)/5),sliding.avg,
dat=data.frame(Pos=this.df$BP,Fst=this.df[,fst.name]),
width=nrow(this.df)*0.15))
return(this.smooth)
})
} else{
smooth.fsts<-lapply(fst.dfs,function(df){
this.smooth<-df[df$Chr %in% chrom,c("BP",fst.name)]
})
}
if(is.data.frame(deltad)){
this.dd<-deltad[deltad$Chrom %in% chrom,]
if(smooth.loess==TRUE){
dd.smooth<-loess.smooth(this.dd$Pos,this.dd$deltad,span=0.1,degree=2)
names(dd.smooth)<-c("pos","smoothed.stats")
}else{
dd.smooth<-this.dd
}
}else if(is.list(deltad)){
dd.smooth<-deltad[[2]][deltad[[2]][,"chr"]%in%chrom,]
}else{
dd.smooth<-NULL
}
if(is.data.frame(D)){
this.d<-D[D$Chr %in% chrom,]
if(smooth.loess==TRUE) {
dsmooth<-loess.smooth(this.d$POS,this.d$D,span=0.1,degree=2)
names(dd.smooth)<-c("pos","smoothed.stats")
}else{
dsmooth<-this.d
}
}else if(is.list(D)){
dsmooth<-D[[2]][D[[2]][,"chr"]%in%chrom,]
}else{
D<-NULL
}
pr.gene<-put.reg[put.reg$Chrom==chrom & put.reg$Gene==gene,]
fst.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$rad.loci[pr.gene$rad.loci != "NA"]),","))
sal.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$bfs.in[pr.gene$bfs.in != "NA"]),","))
# tmp.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$bft.in[pr.gene$bft.in != "NA"]),","))
#grs.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$bfg.in[pr.gene$bfg.in != "NA"]),","))
#jd.gene<-unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$jostd.nb[jostd.nb != "NA"]),","))
sdd.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$sdd.in[pr.gene$sdd.in != "NA"]),","))
#nj.gene<-as.numeric(unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$nj.nb[pr.gene$nj.nb != "NA"]),",")))
nj.gene<-NULL
nb.gene<-as.numeric(unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$nearby.rad[pr.gene$nearby.rad != "NA"]),",")))
gene.out<-data.frame(BP=c(as.numeric(fst.gene),as.numeric(sal.gene),as.numeric(sdd.gene)),
shape=c(rep(8,length(fst.gene)),rep(1,length(sal.gene)),rep(4,length(sdd.gene))))
if(length(colors) != length(smooth.fsts)){
colors<-rep(colors,length(smooth.fsts)/length(colors))
}
if(length(lwds) != length(smooth.fsts)){
lwds<-rep(lwds,length(smooth.fsts)/length(lwds))
}
if(length(alphas) != length(smooth.fsts)){
alphas<-rep(alphas,length(smooth.fsts)/length(alphas))
}
if(length(ltys) != length(smooth.fsts)){
ltys<-rep(ltys,length(smooth.fsts)/length(ltys))
}
plot(smooth.fsts[[1]],type='n',ylim=c(y.lim[1],y.lim[2]+0.02),
bty="L",xlab="",ylab="",xaxt='n',yaxt='n',xlim=c(pstart,pstop))
axis(2,las=1,cex.axis=txt.cex)
xlabs<-c(pstart,pstop)
if(bases %in% c("mb","MB","Mb")) xlabs<-c(round((pstart/1000000),2),round((pstop/1000000),2))
if(bases %in% c("kb","KB","Kb")) xlabs<-c(round((pstart/1000),2),round((pstop/1000),2))
axis(1,at=c(pstart,pstop),cex.axis=txt.cex,labels = xlabs)
#add putative gene
rect(xleft=as.numeric(starts),xright=as.numeric(stops),
ybottom=y.lim[1]-0.04,ytop=y.lim[2],col="indianred",border="indianred")
#add fsts
mtext(chrom,1,cex=0.75*txt.cex,line = 1)
for(i in 1:length(smooth.fsts)){
points(smooth.fsts[[i]],col=alpha(colors[i],alphas[i]),type="l",lwd=lwds[i],lty=ltys[i])
}
#add delta divergence
if(is.data.frame(dd.smooth)){
points(dd.smooth$pos,dd.smooth$smoothed.stats,col="#dfc27d",type="l",lwd=2)
}
if(is.list(deltad)){
upp.low.pts(smooth=deltad[[1]],target=deltad[[2]],chrom=chrom,color=comp.col["deltad"],stat="deltad",pos.name="Pos")
}
#add Jost's D
if(is.data.frame(dsmooth)){
points(dsmooth$pos,dsmooth$smoothed.stats,type="l",col="#a6611a",lwd=2)
}
if(is.list(D)){
upp.low.pts(smooth=D[[1]],target=D[[2]],chrom=chrom,color=comp.col["D"],stat="D",pos.name="Pos")
}
#are nj trees shown in scope of fig?
njs.eval<-lapply(nj.gene,function(x){
if(x >= pstart & x <= pstop) { eval = TRUE }
else { eval = FALSE }
return(eval)
})
#add delta d
if(is.data.frame(deltad)){
lgnd<-unlist(lapply(names(fst.dfs),function(n) { bquote(.(n)~italic(F[ST])) }))
lgnd<-c(lgnd,expression(delta~-divergence))
}else{
lgnd<-unlist(lapply(names(fst.dfs),function(n) { substitute(paste(name,italic(F[ST]),sep=""),list(name=n)) }))
}
cols<-NULL
pchs<-rep(32,length(lwds))
for(i in 1:length(colors)){
cols<-c(cols,alpha(colors[i],alphas[i]))
}
if(!is.null(deltad)){
cols<-c(cols,"#dfc27d")
pchs<-c(pchs,32)
ltys<-c(ltys,1)
lwds<-c(lwds,2)
}
if(!is.null(D)){
cols<-c(cols,"#a6611a")
pchs<-c(pchs,32)
lgnd<-c(lgnd,"Jost's D")
ltys<-c(ltys,1)
lwds<-c(lwds,2)
}
lgnd<-c(lgnd,substitute(italic(g),list(g=gene)),"Outlier RAD loci")
cols<-c(cols,"indianred","black")
pchs<-c(pchs,15,124)
if(length(grep(TRUE,njs.eval))>0){
lgnd<-c(lgnd,"Monophyletic Gene Tree")
cols<-c(cols,"#08519c")
pchs<-c(pchs,124)
}
if(length(sal.gene)>0){
lgnd<-c(lgnd,"Salinity-associated")
cols<-c(cols,"black")
pchs<-c(pchs,25)
}
if(legend==TRUE){
legend("topleft",
legend=lgnd,pch=pchs,
bty='n',lwd=c(lwds,0,0,0,0),lty=c(ltys,0,0,0,0),
col=cols,cex = txt.cex)
}
#add outlier loci
clip(x1=min(as.numeric(pstart)),x2=max(as.numeric(pstop)),y1=y.lim[2]+.01,y2=y.lim[2]+0.2)
abline(v=fst.gene,col="black")
points(x=sal.gene,y=rep(y.lim[2]+0.05,length(sal.gene)),col="black",pch=25,cex=0.75*txt.cex,lwd=2)
abline(v=nb.gene,col=alpha("black",0.5),cex=2)
abline(v=nj.gene,col="#08519c",cex=2)
}
outside.legend<-function(...){
opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0),
mar=c(0, 0, 0, 0), new=TRUE)
on.exit(par(opar))
plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
legend(...)
}
#set up legend parameters
leg.text<-c(expression(TX~FWSW~italic(F)[ST]),expression(LA~FWSW~italic(F)[ST]),
expression(AL~FWSW~italic(F)[ST]),expression(FL~FWSW~italic(F)[ST]),
expression(delta~-divergence),expression("Jost's"~italic(D)),
expression(Shared~italic(F)[ST]~Outliers),
"Salinity-associated SNPs")
leg.pchs<-c(rep(32,6),124,25,124)
leg.ltys<-c(1,1,2,1,1,1,0,0,0)
leg.cols<-c(colors,comp.col["deltad"],comp.col["D"],"black","black","indianred")
png("ARGHEF3.png",height=8,width=10,units="in",res=300)
par(mfrow=c(5,3),oma=c(2,2,3,2),mar=c(2,2,1,1))
arhgef3<-lapply(unique(put.reg[put.reg$Gene=="ARHGEF3" & put.reg$Chrom%in%lgs,"Chrom"]),# & !is.na(put.reg$rad.loci)
gene.region.plot,gene="ARHGEF3",put.reg=put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
outside.legend("top",legend=c(leg.text,expression(italic(ARHGEF3))),pch=leg.pchs,lty=leg.ltys,lwd=2,
bty='n',ncol=3,cex=1.25,x.intersp=0,col=leg.cols)
dev.off()
png("mucin.png",height=8,width=10,units="in",res=300)
par(mfrow=c(5,3),oma=c(2,2,3,2),mar=c(2,2,1,1))
mucin<-lapply(unique(put.reg[put.reg$Gene=="mucin"& put.reg$Chrom%in%lgs,"Chrom"]),# & !is.na(put.reg$rad.loci)
gene.region.plot,gene="mucin",put.reg=put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
outside.legend("top",legend=c(leg.text,expression(italic(mucin))),pch=leg.pchs,lty=leg.ltys,lwd=2,
bty='n',ncol=3,cex=1.25,x.intersp=0,col=leg.cols)
dev.off()
png("NHE.png",height=4,width=6,units="in",res=300)
par(mfrow=c(2,4),oma=c(2,2,3,2),mar=c(2,2,2,2))
nhe<-lapply(unique(put.reg[put.reg$Gene=="NHE" & put.reg$Chrom%in%lgs,"Chrom"]),# & !is.na(put.reg$rad.loci)
gene.region.plot,gene="NHE",put.reg=put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
outside.legend("top",legend=c(leg.text,expression(italic(NHE))),pch=leg.pchs,lty=leg.ltys,lwd=2,
bty='n',ncol=3,cex=1.25,x.intersp=0,col=leg.cols)
dev.off()
png("TAAR.png",height=4,width=6,units="in",res=300)
par(mfrow=c(1,2),oma=c(2,2,3,2),mar=c(2,2,2,2))
taar<-lapply(unique(put.reg[put.reg$Gene=="TAAR" & put.reg$Chrom%in%lgs,"Chrom"]), #& !is.na(put.reg$rad.loci)
gene.region.plot,gene="TAAR",put.reg=put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
outside.legend("top",legend=c(leg.text,expression(italic(TAAR))),pch=leg.pchs,lty=leg.ltys,lwd=2,
bty='n',ncol=3,cex=.75,x.intersp=0,col=leg.cols)
dev.off()
Supplmental Fig. 5
Supplemental Fig. 6
Supplemental Fig. 7
Supplemental Fig. 8
colors=c(grp.colors[1],grp.colors[2],grp.colors[3],grp.colors[6])
alphas=c(0.5,0.5,1,0.5)
ltys=c(1,1,2,1)
png("Fig7candidateGenes.png",height=10,width=10,units="in",res=300)
par(mfrow=c(4,3),oma=c(2,2,4.5,2),mar=c(1,3,2,1))
#row1
gene.region.plot("LG1","NHE",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=4500000,y=0.5,expression(italic(NHE)),xpd=T,cex=2)
gene.region.plot("LG5","NHE",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=4500000,y=0.5,expression(italic(NHE)),xpd=T,cex=2)
gene.region.plot("LG11","NHE",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=3000000,y=0.5,expression(italic(NHE)),xpd=T,cex=2)
#row2
gene.region.plot("LG2","mucin",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=4500000,y=0.5,expression(italic(mucin)),xpd=T,cex=2)
gene.region.plot("LG4","TAAR",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=15000000,y=0.5,expression(italic(TAAR)),xpd=T,cex=2)
gene.region.plot("LG7","TAAR",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=3000000,y=0.5,expression(italic(TAAR)),xpd=T,cex=2)
#row3
gene.region.plot("LG7","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=8500000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
gene.region.plot("LG13","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=2200000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
gene.region.plot("LG14","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=6500000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
#row4
gene.region.plot("LG18","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=5500000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
gene.region.plot("LG19","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=4000000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
gene.region.plot("LG20","ARHGEF3",put.reg,vcf=vcf,chrom.ld=chrom.ld,
fst.dfs=fst.dfs,deltad=dd.bp.smooth,D=jd.bp.smooth,colors=colors,alphas=alphas,ltys=ltys,
smooth=FALSE,smooth.loess=FALSE,fst.name="Smoothed.AMOVA.Fst",bases="MB",legend=FALSE)
text(x=4000000,y=0.5,expression(italic(ARHGEF3)),xpd=T,cex=2)
outside.legend("top",legend=c(expression(TX~FWSW~italic(F)[ST]),expression(LA~FWSW~italic(F)[ST]),
expression(AL~FWSW~italic(F)[ST]),expression(FL~FWSW~italic(F)[ST]),
expression(delta~-divergence),expression("Jost's"~italic(D)),
expression(Shared~italic(F)[ST]~Outliers),
"Salinity-associated SNPs","FW gene"),
pch=c(rep(32,6),124,25,124),lty=c(1,1,2,1,1,1,0,0,0),lwd=2,
bty='n',ncol=5,cex=1.25,x.intersp=0,
col=c(colors,comp.col["deltad"],comp.col["D"],"black","black","indianred"))
#
# par(fig = c(0, 1, 0, 1),oma=rep(0, 4), mar=rep(0, 4), new=TRUE)
# plot(0:1.5, 0:1.5, type="n", xlab="", ylab="", axes=FALSE)
# legend(x=0.5,y=0.22,c(expression(Shared~italic(F)[ST]~Outliers),
# "Salinity-associated SNPs",
# "FW Candidate Gene"),
# pch=c(124,25,124),lty=c(0,0,0),lwd=2,
# col=c("black","black","indianred"),
# bty='n',ncol=1,cex=2,x.intersp=-0.5,xpd=T)
#
#
# #add lines to the top
# par(fig = c(0, 1, 0, 1),oma=rep(0, 4), mar=rep(0, 4), new=TRUE)
# plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
# legend("top",c(expression(TX~FWSW~italic(F)[ST]),
# expression(LA~FWSW~italic(F)[ST]),
# expression(AL~FWSW~italic(F)[ST]),
# expression(FL~FWSW~italic(F)[ST]),
# expression(delta~-divergence)),
# pch=rep(32,5),lty=c(1,1,1,1,1),lwd=2,text.width=0.16,
# col=c(grp.colors[1],grp.colors[2],grp.colors[3],grp.colors[6],
# "#dfc27d"),
# bty='n',ncol=3,cex=2,x.intersp=0.5,y.intersp=0.75,xpd=T)
dev.off()
Fig. 7
LG4 clearly is enriched for outliers, let’s look at it more closely, including putative FW genes.
#define a few things
chrom<-"LG4"
pchs<-c(rep(32,6),15,124,124)
cols<-c(grp.colors[1],grp.colors[2],grp.colors[3],grp.colors[6],
"#dfc27d","#a6611a","indianred","black","#08519c")
comp.col<-c(Het="#80cdc1",pi="#018571",Fst="black",D="#a6611a",deltad="#dfc27d")
ltys=c(1,1,2,1,1,1,0,0,0)
lwds<-c(rep(2,6),0,0,0)
#put together the data for LG4
#this.dd<-deltad[deltad$Chrom %in% chrom,]
#dd.smooth<-loess.smooth(this.dd$Pos,this.dd$deltad,span=0.1,degree=2)
#add Jost's D
#this.d<-jostd[jostd$Chr %in% chrom,]
#dsmooth<-loess.smooth(this.d$POS,this.d$D,span=0.1,degree=2)
#get the outliers
pr.gene<-put.reg[put.reg$Chrom==chrom ,]
#rad loci
fst.gene<- stacks.sig[stacks.sig$Chr==chrom,"BP"]
#none of the salinity-associated RAD loci are in or near genes on LG8, but we could plot them all anyway
sal.gene<-sal.bf.sig[sal.bf.sig$scaffold=="LG8","BP"]
#jost's d - none are in the putative genes
jd.gene<-unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$jost.in[jost.in != "NA"]),","))
#delta divergence outliers- none are in the putative genes
sdd.gene<- unlist(strsplit(gsub(paste(chrom,".",sep=""),"",pr.gene$sdd.in[pr.gene$sdd.in != "NA"]),","))
#put all of them together with their associated shapes
gene.out<-data.frame(BP=c(as.numeric(fst.gene),as.numeric(sal.gene),as.numeric(sdd.gene)),
shape=c(rep(8,length(fst.gene)),rep(1,length(sal.gene)),rep(4,length(sdd.gene))))
#add putative genes
g<-put.reg[put.reg$Chrom %in% chrom,]
g$Gene<-as.character(g$Gene)
g$Gene[grep("ATP6",g$Gene)]<-"ATP6V" #standardize the names
g<-g[,c("Gene","StartBP","StopBP")]
g<-g[!duplicated(g$StartBP),]
g<-g[order(g$StartBP),]
## Model 1
source("../programs/dmc-master/getMCLE.R")
dmc.pattern<-"p4LG4_100000_2_9_forReal"
positions<-readRDS("dmc/selectedRegionPositions_p4LG4.RDS")
selSite = positions[seq(1, length(positions[positions<10000000]), length.out = 100)]
sels = c(1e-4, 1e-3, 0.01, seq(0.02, 0.14, by = 0.01), seq(0.15, 0.3, by = 0.05),
seq(0.4, 0.6, by = 0.1))
times = c(0, 5, 25, 50, 100, 500, 1000, 1e4, 1e6)
migs = c(10^-(seq(5, 1, by = -2)), 0.5, 1)
Ne<-100000
rec=2*10^-9
sources<-c(3,5,7,16)
gs<-c(1/(2*Ne), 10^-(4:1))
compLikelihood_ind<-readRDS(paste("dmc/compLikelihood_ind_",dmc.pattern,".RDS",sep=""))
mcle_ind = getMCLEind(compLikelihood_ind, selSite, sels)
mcle_ind_index<-which(mcle_ind[1]==selSite)
profileLike_sel_ind = sapply(1: length(sels), function(i) {
max(unlist(compLikelihood_ind[[mcle_ind_index]][[i]]))
})
## Model 3
compLikelihood_sv<-readRDS(paste("dmc/compLikelihood_sv_",dmc.pattern,".RDS",sep=""))
mcle_sv<-getMCLEsv_source(compLikelihood_sv, selSite, sels, gs, times, sources)
mcle_sv_index<-which(mcle_sv[1]==selSite)
profileLike_sel_sv = sapply(1: length(sels), function(i) {
max(unlist(compLikelihood_sv[[mcle_sv_index]][[i]]))
})
## Model 5
compLikelihood_mixed_svInd<-readRDS(paste("dmc/compLikelihood_mixed_svInd_",dmc.pattern,".RDS",sep=""))
mcle_mixed_svInd<-getMCLEmixed(compLikelihood_mixed_svInd, selSite, sels, gs, times, migs[1], sources)
mcle_mixed_svInd_index<-which(mcle_mixed_svInd[1]==selSite)
profileLike_sel_mixed_svInd = sapply(1: length(sels), function(i) {
max(unlist(compLikelihood_mixed_svInd[[mcle_mixed_svInd_index]][[i]]))
})
#functions for plotting dmc output
calc.max.complike<-function(pattern,neutral_name=NA){
#read in composite likelihood files and calculate max for all proposed selected sites
if(is.na(neutral_name)){
compLikelihood_neutral = readRDS(paste("dmc/compLikelihood_neutral_",dmc.pattern,".RDS",sep=""))
}else{
compLikelihood_neutral = readRDS(neutral_name)
}
compLikelihood_neutral_site = sapply(1 : length(compLikelihood_neutral), function(i) {
max(unlist(compLikelihood_neutral[[i]]))
})
compLikelihood_ind = readRDS(paste("dmc/compLikelihood_ind_",pattern,".RDS",sep=""))
compLikelihood_ind_site = sapply(1 : length(compLikelihood_ind), function(i) {
max(unlist(compLikelihood_ind[[i]]))
})
compLikelihood_mig = readRDS(paste("dmc/compLikelihood_mig_",pattern,".RDS",sep=""))
compLikelihood_mig_site = sapply(1 : length(compLikelihood_mig), function(i) {
max(unlist(compLikelihood_mig[[i]]))
})
compLikelihood_sv = readRDS(paste("dmc/compLikelihood_sv_",pattern,".RDS",sep=""))
compLikelihood_sv_site = sapply(1 : length( compLikelihood_sv), function(i) {
max(unlist(compLikelihood_sv[[i]]))
})
compLikelihood_mixed_migInd = readRDS(paste("dmc/compLikelihood_mixed_migInd_",pattern,".RDS",sep=""))
compLikelihood_mixed_migInd_site = sapply(1 : length(compLikelihood_mixed_migInd), function(i) {
max(unlist(compLikelihood_mixed_migInd[[i]]))
})
compLikelihood_mixed_svInd = readRDS(paste("dmc/compLikelihood_mixed_svInd_",pattern,".RDS",sep=""))
compLikelihood_mixed_svInd_site = sapply(1 : length(compLikelihood_mixed_svInd), function(i) {
max(unlist(compLikelihood_mixed_svInd[[i]]))
})
return(list(max.likes=c(ind=(compLikelihood_ind_site - compLikelihood_neutral_site),
mig=(compLikelihood_mig_site - compLikelihood_neutral_site),
sv=(compLikelihood_sv_site - compLikelihood_neutral_site),
migInd=(compLikelihood_mixed_migInd_site - compLikelihood_neutral_site),
svInd=(compLikelihood_mixed_svInd_site - compLikelihood_neutral_site)),
max.complikes=c(neutral=compLikelihood_neutral,ind=compLikelihood_ind,
mig=compLikelihood_mig,
sv=compLikelihood_sv,
migInd=compLikelihood_mixed_migInd,
svInd=compLikelihood_mixed_svInd)))
}
plot.complike<-function(pattern,leg=TRUE,lab=TRUE,plot_range = NA,selSite){
}
png("LG4.png",width=10,height=7,units="in",res=300)
nf <- layout(matrix(c(1,1,1,1,
2,2,2,2,
3,3,3,3), nrow=3, byrow=TRUE))
### Plot all of LG4
par(oma=c(2.5,5,2,1),mar=c(1.5,1,2,1))
plot(fst.dfs[[1]]$BP[fst.dfs[[1]]$Chr %in% chrom],
fst.dfs[[1]]$Smoothed.AMOVA.Fst[fst.dfs[[1]]$Chr %in% chrom],
type='n',ylim=c(0,1),bty="L",xlab="",ylab="",xaxt='n',yaxt='n')
axis(2,las=1,cex.axis=1.75)
axis(1,cex.axis=1.75,at=seq(0,1.8*10^7,3*10^6),labels = seq(0,18,3))
#add fsts
#mtext(chrom,1,cex=2*0.75,line=2.5)
for(i in 1:length(fst.dfs)){
points(fst.dfs[[i]]$BP[fst.dfs[[i]]$Chr %in% chrom],
fst.dfs[[i]]$Smoothed.AMOVA.Fst[fst.dfs[[i]]$Chr %in% chrom],
col=alpha(colors[i],alphas[i]),type="l",lwd=lwds[i],lty=ltys[i])
}
#add delta-d and Jost's D
#Pi
# points(pi.bp.smooth[[2]][pi.bp.smooth[[2]][,"chr"]%in% chrom,c("pos","smoothed.stats")],
# type="l",lwd=2,col=comp.col["pi"])
# upp.low.pts(smooth=pi.bp.smooth[[1]],target=pi.bp.smooth[[2]],
# chrom=chrom,color=comp.col["pi"],stat="Pi",pos.name="Pos",cex=1.75)
#Het
# points(ht.bp.smooth[[2]][ht.bp.smooth[[2]][,"chr"]%in% chrom,c("pos","smoothed.stats")],
# type="l",col=comp.col["Het"],lwd=2)
# upp.low.pts(smooth=ht.bp.smooth[[1]],target=ht.bp.smooth[[2]],
# chrom=chrom,color=comp.col["Het"],stat="Het",pos.name="Pos",cex=1.75)
#deltad
points(dd.bp.smooth[[2]][dd.bp.smooth[[2]][,"chr"]%in% chrom,c("pos","smoothed.stats")],
type="l",col=comp.col["deltad"],lwd=2)
upp.low.pts(smooth=dd.bp.smooth[[1]],target=dd.bp.smooth[[2]],
chrom=chrom,color=comp.col["deltad"],stat="deltad",pos.name="Pos",cex=1.75)
#Josts D
points(jd.bp.smooth[[2]][jd.bp.smooth[[2]][,"chr"]%in% chrom,c("pos","smoothed.stats")],
type="l",col=comp.col["D"],lwd=2)
upp.low.pts(smooth=jd.bp.smooth[[1]],target=jd.bp.smooth[[2]],
chrom=chrom,color=comp.col["D"],stat="D",pos.name="Pos",cex=1.75)
#add the putative genes
starts<-as.numeric(g[,"StartBP"])
stops<-as.numeric(g[,"StopBP"])
rect(xleft=as.numeric(starts),xright=as.numeric(stops),
ybottom=-0.04,ytop=1,col="indianred",border="indianred")
#add the ones that are spaced out
text(x=g$StartBP[!g$Gene %in% c("SCUBE1","TRIM14")],y=0.5,
labels=g$Gene[!g$Gene %in% c("SCUBE1","TRIM14")],font=2,srt=90,xpd=T,cex=1.75)
text(x=g$StartBP[g$Gene == "SCUBE1"]-50000,y=0.8,
labels=g$Gene[g$Gene =="SCUBE1"],font=2,srt=90,xpd=T,cex=1.75)
text(x=g$StartBP[g$Gene == "TRIM14"]+50000,y=0.2,
labels=g$Gene[g$Gene =="TRIM14"],font=2,srt=90,xpd=T,cex=1.75)
#add outlier loci
clip(x1=min(as.numeric(fst.dfs[[1]]$BP[fst.dfs[[1]]$Chr %in% chrom])),
x2=max(as.numeric(fst.dfs[[1]]$BP[fst.dfs[[1]]$Chr %in% chrom])),y1=0.99,y2=1.06)
abline(v=fst.gene,col="black",lwd=1.5)
points(x=sal.gene,y=rep(1.05,length(sal.gene)),col="black",pch=25,cex=1)
### Plot the recombination rate
load("../../scovelli_genome/ssc_map.rda")
rec.cols<-c("#fdbb84","#fc8d59","#ef6548")
plot(set[["LG4"]]@interpolations$slidingwindow@physicalPositions,
set[["LG4"]]@interpolations$slidingwindow@rates,type="l",bty="L",
ylim=c(-5,25),xlab="",ylab="",xaxt='n',yaxt='n',lwd=2,col=rec.cols[1])
abline(h=0,lty=2)
axis(2,cex.axis=1.75,las=1)
axis(1,at=seq(0,1.8*10^7,3*10^6),labels = seq(0,18,3),cex.axis=1.75)
mtext("Recombination Rate\n(cM/Mb)",2,line=3,cex=1.5*0.75)
lines(set[["LG4"]]@interpolations$spline@physicalPositions,
set[["LG4"]]@interpolations$spline@rates,type="l",col=rec.cols[2],lwd=2)
lines(set[["LG4"]]@interpolations$loess@physicalPositions,
set[["LG4"]]@interpolations$loess@rates,type="l",lwd=2,col=rec.cols[3])
legend("topleft",bty='n',col=rec.cols,
lwd=2,c("Sliding Window","Spline","Loess"),cex=1.5)
### Plot dmc results
max.complikes<-calc.max.complike(dmc.pattern)
max.likes<-max.complikes[[1]]
plot_range = range(max.likes)
plot(selSite, max.likes[grep("ind",names(max.likes))], type = "b",bty="l",
ylim = c(plot_range[1] - 50, plot_range[2] + 50),
xlim=range(fst.dfs[[1]]$BP[fst.dfs[[1]]$Chr %in% chrom]),
xlab = "x.lab",ylab = "",cex=2,las=1,cex.axis=1.75,xaxt='n')
axis(1,at=seq(0,1.8*10^7,3*10^6),labels = seq(0,18,3),cex.axis=1.75)
lines(selSite, max.likes[grep("mig\\d+",names(max.likes))],cex=2, col = "#d7191c",#red
type = "b")
lines(selSite, max.likes[grep("sv\\d+",names(max.likes))],cex=2, col = "#2b83ba",#blue
type = "b")
lines(selSite, max.likes[grep("migInd",names(max.likes))],cex=2,#orange
col = "#fdae61", lty = 2, type = "b")
lines(selSite, max.likes[grep("svInd",names(max.likes))],cex=2,#green
col = "#91cf60", lty = 2, type = "b")
legend("topright", col = c("black", "#d7191c", "#2b83ba", "#fdae61", "#abdda4"),
lty = c(rep(1, 3), rep(2, 2)), sapply(1 : 5, function(i) paste("Model", i)),
bty='n',cex=1.5,lwd=2)
mtext("Composite Log Likelihood",2,line=3.75,cex=1.5*0.75)
mtext("Position on LG4 (Mb)",1,line=2.2,cex=1.5*0.75)
abline(v=mcle_ind[1],lty=2,lwd=2,col="black",xpd=FALSE)
abline(v=mcle_sv[1],lty=2,lwd=2,col="#2b83ba",xpd=FALSE)
abline(v=mcle_mixed_svInd[1],lty=2,lwd=2,col="#91cf60",xpd=FALSE)
### add legend
#create the legend text, and parameters
lgnd<-unlist(lapply(names(fst.dfs),function(n) { bquote(.(n)~italic(F[ST])) }))
lgnd<-c(lgnd,expression(delta~-divergence),"Jost's D","gene regions",
expression("Shared"~italic(F[ST])~"outlier"),"Salinity-associated")
pchs<-c(rep(32,6),15,124,25)
cols<-c(grp.colors[1],grp.colors[2],grp.colors[3],grp.colors[6],
"#dfc27d","#a6611a","indianred","black","black")
ltys=c(1,1,2,1,1,1,0,0,0)
lwds<-c(rep(2,6),0,0,1)
#plot the legend
par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), mar=c(0, 0, 0, 0), new=TRUE,xpd=TRUE)
plot(c(0,1),c(0,1),bty='n',type='n',xlab="",ylab="",xaxt='n',yaxt='n')
legend("top",legend=lgnd,pch=pchs,ncol=5,
bty='n',lwd=lwds,lty=ltys,
col=cols,xpd=TRUE,cex=1.5,
x.intersp = 0.5,y.intersp = 0.75)
dev.off()
## png
## 2
LG 4 Figure
The loci that are the most likely targets of selection in models 3 and 5 are located at \(4.43784\times 10^{6}\) and \(4.645842\times 10^{6}\), respectively. Let’s use the gff file to annotate them.
#read in the gff if necessary
if(!("gff" %in% ls())){
if(length(grep("gz",gff.name))>0){
gff<-read.delim(gzfile(paste("../../scovelli_genome/",gff.name,sep="")),header=F)
} else{
gff<-read.delim(paste("../../scovelli_genome/",gff.name,sep=""),header=F)
}
colnames(gff)<-c("seqname","source","feature","start","end","score","strand","frame","attribute")
}
#get the blast matches
genome.blast<-read.csv("../../scovelli_genome/ssc_2016_12_20_cds_nr_blast_results.csv",skip=1)
#where are the loci?
gff[gff$seqname=="LG4" & gff$start <= mcle_sv[1] & gff$end >= mcle_sv[1],]
## seqname source feature start end score strand frame
## 321376 LG4 . contig 3641993 10355651 . . .
## attribute
## 321376 ID=scaffold_0;Name=scaffold_0
gff[gff$seqname=="LG4" & gff$start <= mcle_mixed_svInd[1] & gff$end >= mcle_mixed_svInd[1],]
## seqname source feature start end score strand frame
## 321376 LG4 . contig 3641993 10355651 . . .
## attribute
## 321376 ID=scaffold_0;Name=scaffold_0
dmc.genes<-data.frame(Locus.ID=c("Mod3","Mod5"),Chr=c("LG4","LG4"),BP=c(mcle_sv[1],mcle_mixed_svInd[1]),Column=c(1,1)) #format it appropriately
dmc.ann<-sig.region.ann(dmc.genes,gff,genome.blast)
kable(dmc.ann[,c("Locus","description")],row.names = FALSE)
| Locus | description |
|---|---|
| Mod3 | NA |
| Mod5 | NA |
Finally, I will write the putative gene regions to a file to be supplemental table 2.
#Provide more informative names
colnames(put.reg)<-c("Gene","Function","Chromosome_CitedSpeices","CitedSpecies","Citation","Scovelli_geneID",
"My_Annotation_Notes","Scovelli_Chrom","Scovelli_StartBP","Scovelli_StopBP",
"Contains_RAD_locus","Near_RAD_locus","Contains_DeltaDivergence","Near_DeltaDivergence",
"Contains_BayenvSalinity","Contains_BayenvTemp","Contains_BayenvSeagrass",
"Near_JostD","Contains_JostD","Contains_Fst")
write.table(put.reg,"putative_genes_annotated.txt",sep='\t',col.names = TRUE,row.names = FALSE,quote=FALSE)